[llvm] 1ddc51d - Inliner: don't mark call sites as 'nounwind' if that would be redundant
Nicolai Hähnle via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 20 05:17:39 PDT 2022
Author: Nicolai Hähnle
Date: 2022-07-20T14:17:23+02:00
New Revision: 1ddc51d89d9d3a3f77d6ca47514a776641dc3e4e
URL: https://github.com/llvm/llvm-project/commit/1ddc51d89d9d3a3f77d6ca47514a776641dc3e4e
DIFF: https://github.com/llvm/llvm-project/commit/1ddc51d89d9d3a3f77d6ca47514a776641dc3e4e.diff
LOG: Inliner: don't mark call sites as 'nounwind' if that would be redundant
When F calls G, G calls H, G is nounwind, and G is inlined into F, the
inlined call site of H should be effectively nounwind so that no
information is lost during inlining.
If H itself is nounwind (which often happens when H is an intrinsic), we
no longer mark the call site explicitly as nounwind. Previously, there
were cases where the inlined call site of H differed from a pre-existing
call site of H in F *only* in the explicitly added nounwind attribute,
which prevented common subexpression elimination; the reduced sketch
below illustrates this.
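As an illustration, here is a reduced sketch of the situation (the
functions @f and @g are hypothetical; the intrinsic is one of those
appearing in the test updates below and is nounwind by declaration):

  declare <16 x i8> @llvm.ppc.altivec.lvsl(i8*)

  define <16 x i8> @g(i8* %p) nounwind {
    ; H, already nounwind via its declaration
    %r = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* %p)
    ret <16 x i8> %r
  }

  define <16 x i8> @f(i8* %p) {
    ; a pre-existing call to H, with no explicit call-site attributes
    %pre = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* %p)
    %r = call <16 x i8> @g(i8* %p)
    ret <16 x i8> %r
  }

Inlining @g into @f used to emit the inlined call as

  %r.i = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* %p) #N

with #N carrying nounwind, so %pre and %r.i looked different to CSE even
though they are the same call. With this change the redundant call-site
attribute is no longer added, which is why the #[[ATTR4]] captures
disappear from the tests below.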
v2:
- just check CI->doesNotThrow()
v3 (resubmit after revert at 344378808778c61d5599f4e0ac783ef7e6f8ed05):
- update Clang tests
Differential Revision: https://reviews.llvm.org/D129860
Added:
Modified:
clang/test/CodeGen/PowerPC/builtins-ppc-ld-st-rmb.c
clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
clang/test/CodeGen/X86/keylocker.c
clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
clang/test/CodeGen/aarch64-ls64.c
clang/test/CodeGen/aarch64-neon-2velem.c
clang/test/CodeGen/aarch64-neon-across.c
clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
clang/test/CodeGen/aarch64-neon-fma.c
clang/test/CodeGen/aarch64-neon-fp16fml.c
clang/test/CodeGen/aarch64-neon-tbl.c
clang/test/CodeGen/aarch64-poly128.c
clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
clang/test/CodeGen/arm-bf16-convert-intrinsics.c
clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
clang/test/CodeGen/arm-neon-fma.c
clang/test/CodeGen/arm-neon-numeric-maxmin.c
clang/test/CodeGen/arm-neon-vcvtX.c
clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
clang/test/CodeGen/arm_acle.c
clang/test/CodeGen/memcpy-inline-builtin.c
clang/test/Headers/wasm.c
clang/test/OpenMP/cancel_codegen.cpp
clang/test/OpenMP/cancellation_point_codegen.cpp
clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
clang/test/OpenMP/for_reduction_task_codegen.cpp
clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
clang/test/OpenMP/parallel_master_taskloop_codegen.cpp
clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
clang/test/OpenMP/parallel_reduction_task_codegen.cpp
clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
clang/test/OpenMP/sections_reduction_task_codegen.cpp
clang/test/OpenMP/target_in_reduction_codegen.cpp
clang/test/OpenMP/target_parallel_codegen.cpp
clang/test/OpenMP/target_parallel_for_codegen.cpp
clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
clang/test/OpenMP/target_teams_codegen.cpp
clang/test/OpenMP/target_teams_distribute_codegen.cpp
clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
clang/test/OpenMP/task_codegen.cpp
clang/test/OpenMP/task_if_codegen.cpp
clang/test/OpenMP/task_in_reduction_codegen.cpp
clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
llvm/lib/Transforms/Utils/InlineFunction.cpp
llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
llvm/test/Transforms/Inline/noalias-calls-always.ll
llvm/test/Transforms/Inline/noalias-calls.ll
llvm/test/Transforms/Inline/noalias-cs.ll
llvm/test/Transforms/Inline/noalias2.ll
llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
Removed:
################################################################################
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-ld-st-rmb.c b/clang/test/CodeGen/PowerPC/builtins-ppc-ld-st-rmb.c
index 627f86c02557f..36a36c2b38daf 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-ld-st-rmb.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-ld-st-rmb.c
@@ -55,21 +55,21 @@
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -87,21 +87,21 @@
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]]
+// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -167,14 +167,14 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -182,7 +182,7 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb1(
@@ -205,14 +205,14 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -220,7 +220,7 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb1(
@@ -291,14 +291,14 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -306,7 +306,7 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb2(
@@ -329,14 +329,14 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -344,7 +344,7 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb2(
@@ -423,14 +423,14 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -438,7 +438,7 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb3(
@@ -461,14 +461,14 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -476,7 +476,7 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb3(
@@ -552,14 +552,14 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -567,7 +567,7 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb4(
@@ -590,14 +590,14 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -605,7 +605,7 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb4(
@@ -684,14 +684,14 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -699,7 +699,7 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb5(
@@ -722,14 +722,14 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -737,7 +737,7 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb5(
@@ -824,14 +824,14 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -839,7 +839,7 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb6(
@@ -862,14 +862,14 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -877,7 +877,7 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb6(
@@ -972,14 +972,14 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -987,7 +987,7 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb7(
@@ -1010,14 +1010,14 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1025,7 +1025,7 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb7(
@@ -1106,14 +1106,14 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1121,7 +1121,7 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb8(
@@ -1144,14 +1144,14 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1159,7 +1159,7 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb8(
@@ -1222,21 +1222,21 @@ void test_strmb8(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -1254,21 +1254,21 @@ void test_strmb8(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -1345,14 +1345,14 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1360,7 +1360,7 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb9(
@@ -1383,14 +1383,14 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1398,7 +1398,7 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb9(
@@ -1485,14 +1485,14 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1500,7 +1500,7 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb10(
@@ -1523,14 +1523,14 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1538,7 +1538,7 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb10(
@@ -1633,14 +1633,14 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1648,7 +1648,7 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb11(
@@ -1671,14 +1671,14 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1686,7 +1686,7 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb11(
@@ -1778,14 +1778,14 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1793,7 +1793,7 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb12(
@@ -1816,14 +1816,14 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1831,7 +1831,7 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb12(
@@ -1926,14 +1926,14 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1941,7 +1941,7 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb13(
@@ -1964,14 +1964,14 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1979,7 +1979,7 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb13(
@@ -2082,14 +2082,14 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2097,7 +2097,7 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb14(
@@ -2120,14 +2120,14 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2135,7 +2135,7 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb14(
@@ -2246,14 +2246,14 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2261,7 +2261,7 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb15(
@@ -2284,14 +2284,14 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2299,7 +2299,7 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb15(
@@ -2366,21 +2366,21 @@ void test_strmb15(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -2398,21 +2398,21 @@ void test_strmb15(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -2472,14 +2472,14 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2487,7 +2487,7 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb16(
@@ -2510,14 +2510,14 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2525,7 +2525,7 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb16(
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c b/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
index 11fb7bc5e7a79..d078caaef11dd 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-quadword-noi128.c
@@ -14,7 +14,7 @@
// CHECK-LE-NEXT: entry:
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3:[0-9]+]]
+// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-LE-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -22,7 +22,7 @@
// CHECK-AIX-NEXT: entry:
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3:[0-9]+]]
+// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-AIX-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -34,7 +34,7 @@ vector unsigned char test_subc(vector unsigned char a, vector unsigned char b) {
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -43,7 +43,7 @@ vector unsigned char test_subc(vector unsigned char a, vector unsigned char b) {
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -56,7 +56,7 @@ vector unsigned char test_subec(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -65,7 +65,7 @@ vector unsigned char test_subec(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -101,7 +101,7 @@ vector unsigned char test_sub(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: entry:
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3]]
+// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-LE-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -109,7 +109,7 @@ vector unsigned char test_sub(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: entry:
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3]]
+// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-AIX-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -121,7 +121,7 @@ vector unsigned char test_addc(vector unsigned char a, vector unsigned char b) {
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -130,7 +130,7 @@ vector unsigned char test_addc(vector unsigned char a, vector unsigned char b) {
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -143,7 +143,7 @@ vector unsigned char test_addec(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -152,7 +152,7 @@ vector unsigned char test_addec(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
-// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
+// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
diff --git a/clang/test/CodeGen/X86/keylocker.c b/clang/test/CodeGen/X86/keylocker.c
index a0aba8d775f27..127e819f7714f 100644
--- a/clang/test/CodeGen/X86/keylocker.c
+++ b/clang/test/CodeGen/X86/keylocker.c
@@ -30,7 +30,7 @@
// CHECK64-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
-// CHECK64-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
+// CHECK64-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK64-NEXT: ret void
//
// CHECK32-LABEL: @test_loadiwkey(
@@ -59,7 +59,7 @@
// CHECK32-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
-// CHECK32-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
+// CHECK32-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK32-NEXT: ret void
//
void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
@@ -86,7 +86,7 @@ void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i
// CHECK64-NEXT: [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
// CHECK64-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
@@ -121,7 +121,7 @@ void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i
// CHECK32-NEXT: [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
// CHECK32-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
@@ -166,7 +166,7 @@ unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
// CHECK64-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK64-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
// CHECK64-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
@@ -211,7 +211,7 @@ unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
// CHECK32-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK32-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
// CHECK32-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
@@ -254,7 +254,7 @@ unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i k
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -289,7 +289,7 @@ unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i k
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -328,7 +328,7 @@ unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -363,7 +363,7 @@ unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -402,7 +402,7 @@ unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -437,7 +437,7 @@ unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -476,7 +476,7 @@ unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
-// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -511,7 +511,7 @@ unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
-// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -565,7 +565,7 @@ unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
@@ -658,7 +658,7 @@ unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
@@ -755,7 +755,7 @@ unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
@@ -848,7 +848,7 @@ unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
@@ -945,7 +945,7 @@ unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
@@ -1038,7 +1038,7 @@ unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
@@ -1135,7 +1135,7 @@ unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
@@ -1228,7 +1228,7 @@ unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
-// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
+// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
diff --git a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
index b4faa3b9da851..0ac2151a8fb79 100644
--- a/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-bf16-dotprod-intrinsics.c
@@ -11,7 +11,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -23,7 +23,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -46,7 +46,7 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -69,7 +69,7 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -92,7 +92,7 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -115,7 +115,7 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -127,7 +127,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMMLAQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
@@ -140,7 +140,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -153,7 +153,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@@ -182,7 +182,7 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -211,7 +211,7 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -240,7 +240,7 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@@ -269,7 +269,7 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
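
Each `_I`-suffixed value above comes from an arm_neon.h wrapper that Clang inlines into the test function; the wrapper is nounwind, and so is the NEON intrinsic it calls, so the inlined call site needs no extra attribute group. A reduced, hypothetical example of the shape these tests compile (names here are for illustration only):

    // g plays the role of the header wrapper: always inlined and, like
    // every plain C/C++ function Clang emits, nounwind.
    static inline __attribute__((always_inline, nothrow))
    int g(int x) { return __builtin_popcount(x); }  // lowers to @llvm.ctpop.i32

    int f(int x) { return g(x); }  // after inlining: a direct call to @llvm.ctpop.i32

If f also called __builtin_popcount directly, the inlined call and the direct call now emit textually identical instructions, whereas previously they differed only in the explicit attribute group on the inlined one.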
diff --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
index dcf8af7438f37..9b2f0a731181c 100644
--- a/clang/test/CodeGen/aarch64-ls64.c
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -26,7 +26,7 @@ uint64_t status;
// CHECK-C-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
// CHECK-C-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[TMP]], i32 0, i32 0
// CHECK-C-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-C-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
+// CHECK-C-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]), !noalias !6
// CHECK-C-NEXT: [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
// CHECK-C-NEXT: store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
// CHECK-C-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
@@ -64,7 +64,7 @@ uint64_t status;
// CHECK-CXX-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
// CHECK-CXX-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[REF_TMP]], i32 0, i32 0
// CHECK-CXX-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-CXX-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
+// CHECK-CXX-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]), !noalias !6
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
// CHECK-CXX-NEXT: store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
// CHECK-CXX-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
@@ -123,7 +123,7 @@ EXTERN_C void test_ld64b(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-C-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: @test_st64b(
@@ -152,7 +152,7 @@ EXTERN_C void test_ld64b(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-CXX-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: ret void
//
EXTERN_C void test_st64b(void)
@@ -186,7 +186,7 @@ EXTERN_C void test_st64b(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-C-NEXT: ret void
//
@@ -216,7 +216,7 @@ EXTERN_C void test_st64b(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-CXX-NEXT: ret void
//
@@ -251,7 +251,7 @@ EXTERN_C void test_st64bv(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-C-NEXT: ret void
//
@@ -281,7 +281,7 @@ EXTERN_C void test_st64bv(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-CXX-NEXT: ret void
//
diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c
index 21a8c4cb6611e..9f551bc92ef48 100644
--- a/clang/test/CodeGen/aarch64-neon-2velem.c
+++ b/clang/test/CodeGen/aarch64-neon-2velem.c
@@ -653,7 +653,7 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -668,7 +668,7 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -683,7 +683,7 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -698,7 +698,7 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -714,7 +714,7 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -730,7 +730,7 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -746,7 +746,7 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -762,7 +762,7 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -777,7 +777,7 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -792,7 +792,7 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -807,7 +807,7 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -822,7 +822,7 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -838,7 +838,7 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -854,7 +854,7 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -870,7 +870,7 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -886,7 +886,7 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -901,7 +901,7 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -916,7 +916,7 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -931,7 +931,7 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -946,7 +946,7 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -962,7 +962,7 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -978,7 +978,7 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -994,7 +994,7 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -1010,7 +1010,7 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -1025,7 +1025,7 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1040,7 +1040,7 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1055,7 +1055,7 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1070,7 +1070,7 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1086,7 +1086,7 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1102,7 +1102,7 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1118,7 +1118,7 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -1134,7 +1134,7 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -1149,7 +1149,7 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
@@ -1163,7 +1163,7 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
@@ -1177,7 +1177,7 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
@@ -1191,7 +1191,7 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
@@ -1206,7 +1206,7 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
@@ -1221,7 +1221,7 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
@@ -1236,7 +1236,7 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
@@ -1251,7 +1251,7 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
@@ -1265,7 +1265,7 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
@@ -1279,7 +1279,7 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
@@ -1293,7 +1293,7 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
@@ -1307,7 +1307,7 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
@@ -1322,7 +1322,7 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
@@ -1337,7 +1337,7 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
@@ -1352,7 +1352,7 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
@@ -1367,7 +1367,7 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
@@ -1382,8 +1382,8 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1398,8 +1398,8 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1415,8 +1415,8 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1432,8 +1432,8 @@ int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1448,8 +1448,8 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -1464,8 +1464,8 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -1481,8 +1481,8 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -1498,8 +1498,8 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -1513,7 +1513,7 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1528,7 +1528,7 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1543,7 +1543,7 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1558,7 +1558,7 @@ int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1574,7 +1574,7 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1590,7 +1590,7 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1606,7 +1606,7 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -1622,7 +1622,7 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -1844,7 +1844,7 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
@@ -1858,7 +1858,7 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
@@ -1872,7 +1872,7 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
@@ -1886,7 +1886,7 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
@@ -1900,7 +1900,7 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
@@ -1914,7 +1914,7 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
@@ -2493,7 +2493,7 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v)
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2508,7 +2508,7 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2523,7 +2523,7 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2538,7 +2538,7 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2554,7 +2554,7 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2570,7 +2570,7 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2586,7 +2586,7 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2602,7 +2602,7 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2617,7 +2617,7 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2632,7 +2632,7 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2647,7 +2647,7 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2662,7 +2662,7 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2678,7 +2678,7 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2694,7 +2694,7 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2710,7 +2710,7 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2726,7 +2726,7 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2741,7 +2741,7 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2756,7 +2756,7 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2771,7 +2771,7 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2786,7 +2786,7 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2802,7 +2802,7 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2818,7 +2818,7 @@ int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2834,7 +2834,7 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD]]
//
@@ -2850,7 +2850,7 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD]]
//
@@ -2865,7 +2865,7 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2880,7 +2880,7 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2895,7 +2895,7 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2910,7 +2910,7 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2926,7 +2926,7 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2942,7 +2942,7 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2958,7 +2958,7 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB]]
//
@@ -2974,7 +2974,7 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB]]
//
@@ -2989,7 +2989,7 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
@@ -3003,7 +3003,7 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
@@ -3017,7 +3017,7 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
@@ -3031,7 +3031,7 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
@@ -3046,7 +3046,7 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
@@ -3061,7 +3061,7 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
@@ -3076,7 +3076,7 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
@@ -3091,7 +3091,7 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
@@ -3105,7 +3105,7 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
@@ -3119,7 +3119,7 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
@@ -3133,7 +3133,7 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
@@ -3147,7 +3147,7 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
@@ -3162,7 +3162,7 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
@@ -3177,7 +3177,7 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
@@ -3192,7 +3192,7 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I]]
//
uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
@@ -3207,7 +3207,7 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I]]
//
uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
@@ -3222,8 +3222,8 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3238,8 +3238,8 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3255,8 +3255,8 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3272,8 +3272,8 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3288,8 +3288,8 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
@@ -3304,8 +3304,8 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
@@ -3321,8 +3321,8 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
@@ -3338,8 +3338,8 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
@@ -3353,7 +3353,7 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -3368,7 +3368,7 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -3383,7 +3383,7 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -3398,7 +3398,7 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -3414,7 +3414,7 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -3430,7 +3430,7 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -3446,7 +3446,7 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]]
//
@@ -3462,7 +3462,7 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
// CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]]
//
@@ -3656,7 +3656,7 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
@@ -3670,7 +3670,7 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
@@ -3684,7 +3684,7 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
@@ -3698,7 +3698,7 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]])
// CHECK-NEXT: ret <2 x float> [[VMULX2_I]]
//
float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
@@ -3712,7 +3712,7 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]])
// CHECK-NEXT: ret <4 x float> [[VMULX2_I]]
//
float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
@@ -3726,7 +3726,7 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]])
// CHECK-NEXT: ret <2 x double> [[VMULX2_I]]
//
float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
@@ -3742,7 +3742,7 @@ float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
@@ -3756,7 +3756,7 @@ int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
@@ -3772,7 +3772,7 @@ int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
@@ -3786,7 +3786,7 @@ uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
@@ -3802,7 +3802,7 @@ uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
@@ -3817,7 +3817,7 @@ int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
@@ -3834,7 +3834,7 @@ int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -3849,7 +3849,7 @@ int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -3866,7 +3866,7 @@ int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -3881,7 +3881,7 @@ uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -3899,8 +3899,8 @@ uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -3915,8 +3915,8 @@ int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -3932,7 +3932,7 @@ int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -3947,7 +3947,7 @@ int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -3964,7 +3964,7 @@ int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -3979,7 +3979,7 @@ uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -3997,8 +3997,8 @@ uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I_I]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
@@ -4013,8 +4013,8 @@ int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I_I]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
@@ -4063,7 +4063,7 @@ float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@@ -4076,7 +4076,7 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[B]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
@@ -4092,7 +4092,7 @@ float64x1_t test_vfma_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
@@ -4107,7 +4107,7 @@ float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FNEG_I]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@@ -4121,7 +4121,7 @@ float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x double> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x double> [[VECINIT_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FNEG_I]], <1 x double> [[VECINIT_I]], <1 x double> [[A]])
// CHECK-NEXT: ret <1 x double> [[TMP3]]
//
float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
@@ -4138,7 +4138,7 @@ float64x1_t test_vfms_n_f64(float64x1_t a, float64x1_t b, float64_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FNEG_I]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
@@ -4261,7 +4261,7 @@ uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
@@ -4274,7 +4274,7 @@ int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
@@ -4289,7 +4289,7 @@ int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: ret <4 x i32> [[VMULL2_I_I]]
//
uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
@@ -4302,7 +4302,7 @@ uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: ret <2 x i64> [[VMULL2_I_I]]
//
uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
@@ -4317,7 +4317,7 @@ uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I_I]]
//
@@ -4331,7 +4331,7 @@ int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I_I]]
//
@@ -4347,7 +4347,7 @@ int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQDMULH_V2_I]]
//
@@ -4367,7 +4367,7 @@ int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQDMULHQ_V2_I]]
//
@@ -4381,7 +4381,7 @@ int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQDMULH_V2_I]]
//
@@ -4397,7 +4397,7 @@ int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQDMULHQ_V2_I]]
//
@@ -4413,7 +4413,7 @@ int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x i16> [[VQRDMULH_V2_I]]
//
@@ -4433,7 +4433,7 @@ int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[VECINIT7_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x i16> [[VQRDMULHQ_V2_I]]
//
@@ -4447,7 +4447,7 @@ int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[B]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x i32> [[VQRDMULH_V2_I]]
//
@@ -4463,7 +4463,7 @@ int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[VECINIT3_I]])
// CHECK-NEXT: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x i32> [[VQRDMULHQ_V2_I]]
//
@@ -4595,7 +4595,7 @@ uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -4609,7 +4609,7 @@ int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -4625,7 +4625,7 @@ int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[ADD_I]]
//
@@ -4639,7 +4639,7 @@ uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[ADD_I:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[ADD_I]]
//
@@ -4656,8 +4656,8 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I_I]]
//
int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4671,8 +4671,8 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I_I]]
//
int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4803,7 +4803,7 @@ uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -4817,7 +4817,7 @@ int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -4833,7 +4833,7 @@ int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 [[C]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <4 x i32> [[SUB_I]]
//
@@ -4847,7 +4847,7 @@ uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 [[C]], i32 1
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
// CHECK-NEXT: [[SUB_I:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I_I]]
// CHECK-NEXT: ret <2 x i64> [[SUB_I]]
//
@@ -4864,8 +4864,8 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[VECINIT3_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I_I]]
//
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
@@ -4879,8 +4879,8 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[VECINIT1_I]])
+// CHECK-NEXT: [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I_I]]
//
int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
@@ -4999,8 +4999,8 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5015,8 +5015,8 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5032,8 +5032,8 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5049,8 +5049,8 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5169,8 +5169,8 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5185,8 +5185,8 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5202,8 +5202,8 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5219,8 +5219,8 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5443,8 +5443,8 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5459,8 +5459,8 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5476,8 +5476,8 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]]
//
int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5493,8 +5493,8 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]]
//
int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
@@ -5613,8 +5613,8 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
@@ -5629,8 +5629,8 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
@@ -5646,8 +5646,8 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]]
//
int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
@@ -5663,8 +5663,8 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #[[ATTR4]]
-// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]])
+// CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]])
// CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]]
//
int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
diff --git a/clang/test/CodeGen/aarch64-neon-across.c b/clang/test/CodeGen/aarch64-neon-across.c
index 6cc58ca970271..a18845d084a2e 100644
--- a/clang/test/CodeGen/aarch64-neon-across.c
+++ b/clang/test/CodeGen/aarch64-neon-across.c
@@ -9,7 +9,7 @@
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -20,7 +20,7 @@ int16_t test_vaddlv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
int32_t test_vaddlv_s16(int16x4_t a) {
@@ -30,7 +30,7 @@ int32_t test_vaddlv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -41,7 +41,7 @@ uint16_t test_vaddlv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
uint32_t test_vaddlv_u16(uint16x4_t a) {
@@ -51,7 +51,7 @@ uint32_t test_vaddlv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -62,7 +62,7 @@ int16_t test_vaddlvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
int32_t test_vaddlvq_s16(int16x8_t a) {
@@ -72,7 +72,7 @@ int32_t test_vaddlvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i64 [[VADDLVQ_S32_I]]
//
int64_t test_vaddlvq_s32(int32x4_t a) {
@@ -82,7 +82,7 @@ int64_t test_vaddlvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -93,7 +93,7 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
uint32_t test_vaddlvq_u16(uint16x8_t a) {
@@ -103,7 +103,7 @@ uint32_t test_vaddlvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i64 [[VADDLVQ_U32_I]]
//
uint64_t test_vaddlvq_u32(uint32x4_t a) {
@@ -113,7 +113,7 @@ uint64_t test_vaddlvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -124,7 +124,7 @@ int8_t test_vmaxv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -135,7 +135,7 @@ int16_t test_vmaxv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -146,7 +146,7 @@ uint8_t test_vmaxv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -157,7 +157,7 @@ uint16_t test_vmaxv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -168,7 +168,7 @@ int8_t test_vmaxvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -179,7 +179,7 @@ int16_t test_vmaxvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMAXVQ_S32_I]]
//
int32_t test_vmaxvq_s32(int32x4_t a) {
@@ -189,7 +189,7 @@ int32_t test_vmaxvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -200,7 +200,7 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -211,7 +211,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMAXVQ_U32_I]]
//
uint32_t test_vmaxvq_u32(uint32x4_t a) {
@@ -221,7 +221,7 @@ uint32_t test_vmaxvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -232,7 +232,7 @@ int8_t test_vminv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -243,7 +243,7 @@ int16_t test_vminv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -254,7 +254,7 @@ uint8_t test_vminv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -265,7 +265,7 @@ uint16_t test_vminv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -276,7 +276,7 @@ int8_t test_vminvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -287,7 +287,7 @@ int16_t test_vminvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMINVQ_S32_I]]
//
int32_t test_vminvq_s32(int32x4_t a) {
@@ -297,7 +297,7 @@ int32_t test_vminvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -308,7 +308,7 @@ uint8_t test_vminvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -319,7 +319,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMINVQ_U32_I]]
//
uint32_t test_vminvq_u32(uint32x4_t a) {
@@ -329,7 +329,7 @@ uint32_t test_vminvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -340,7 +340,7 @@ int8_t test_vaddv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -351,7 +351,7 @@ int16_t test_vaddv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -362,7 +362,7 @@ uint8_t test_vaddv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -373,7 +373,7 @@ uint16_t test_vaddv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -384,7 +384,7 @@ int8_t test_vaddvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -395,7 +395,7 @@ int16_t test_vaddvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VADDVQ_S32_I]]
//
int32_t test_vaddvq_s32(int32x4_t a) {
@@ -405,7 +405,7 @@ int32_t test_vaddvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@@ -416,7 +416,7 @@ uint8_t test_vaddvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@@ -427,7 +427,7 @@ uint16_t test_vaddvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VADDVQ_U32_I]]
//
uint32_t test_vaddvq_u32(uint32x4_t a) {
@@ -437,7 +437,7 @@ uint32_t test_vaddvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMAXVQ_F32_I]]
//
float32_t test_vmaxvq_f32(float32x4_t a) {
@@ -447,7 +447,7 @@ float32_t test_vmaxvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMINVQ_F32_I]]
//
float32_t test_vminvq_f32(float32x4_t a) {
@@ -457,7 +457,7 @@ float32_t test_vminvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMAXNMVQ_F32_I]]
//
float32_t test_vmaxnmvq_f32(float32x4_t a) {
@@ -467,7 +467,7 @@ float32_t test_vmaxnmvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMINNMVQ_F32_I]]
//
float32_t test_vminnmvq_f32(float32x4_t a) {
diff --git a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
index e2407544e70a2..c548a255c7137 100644
--- a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
@@ -9,7 +9,7 @@
// CHECK-LABEL: define {{[^@]+}}@test_vcvtxd_f32_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double [[A]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double [[A]])
// CHECK-NEXT: ret float [[VCVTXD_F32_F64_I]]
//
float32_t test_vcvtxd_f32_f64(float64_t a) {
@@ -19,7 +19,7 @@ float32_t test_vcvtxd_f32_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTAS_S32_F32_I]]
//
int32_t test_vcvtas_s32_f32(float32_t a) {
@@ -29,7 +29,7 @@ int32_t test_vcvtas_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_test_vcvtad_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTAD_S64_F64_I]]
//
int64_t test_test_vcvtad_s64_f64(float64_t a) {
@@ -39,7 +39,7 @@ int64_t test_test_vcvtad_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTAS_U32_F32_I]]
//
uint32_t test_vcvtas_u32_f32(float32_t a) {
@@ -49,7 +49,7 @@ uint32_t test_vcvtas_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTAD_U64_F64_I]]
//
uint64_t test_vcvtad_u64_f64(float64_t a) {
@@ -59,7 +59,7 @@ uint64_t test_vcvtad_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTMS_S32_F32_I]]
//
int32_t test_vcvtms_s32_f32(float32_t a) {
@@ -69,7 +69,7 @@ int32_t test_vcvtms_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTMD_S64_F64_I]]
//
int64_t test_vcvtmd_s64_f64(float64_t a) {
@@ -79,7 +79,7 @@ int64_t test_vcvtmd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTMS_U32_F32_I]]
//
uint32_t test_vcvtms_u32_f32(float32_t a) {
@@ -89,7 +89,7 @@ uint32_t test_vcvtms_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTMD_U64_F64_I]]
//
uint64_t test_vcvtmd_u64_f64(float64_t a) {
@@ -99,7 +99,7 @@ uint64_t test_vcvtmd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTNS_S32_F32_I]]
//
int32_t test_vcvtns_s32_f32(float32_t a) {
@@ -109,7 +109,7 @@ int32_t test_vcvtns_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTND_S64_F64_I]]
//
int64_t test_vcvtnd_s64_f64(float64_t a) {
@@ -119,7 +119,7 @@ int64_t test_vcvtnd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTNS_U32_F32_I]]
//
uint32_t test_vcvtns_u32_f32(float32_t a) {
@@ -129,7 +129,7 @@ uint32_t test_vcvtns_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTND_U64_F64_I]]
//
uint64_t test_vcvtnd_u64_f64(float64_t a) {
@@ -139,7 +139,7 @@ uint64_t test_vcvtnd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTPS_S32_F32_I]]
//
int32_t test_vcvtps_s32_f32(float32_t a) {
@@ -149,7 +149,7 @@ int32_t test_vcvtps_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTPD_S64_F64_I]]
//
int64_t test_vcvtpd_s64_f64(float64_t a) {
@@ -159,7 +159,7 @@ int64_t test_vcvtpd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTPS_U32_F32_I]]
//
uint32_t test_vcvtps_u32_f32(float32_t a) {
@@ -169,7 +169,7 @@ uint32_t test_vcvtps_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTPD_U64_F64_I]]
//
uint64_t test_vcvtpd_u64_f64(float64_t a) {
@@ -179,7 +179,7 @@ uint64_t test_vcvtpd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTS_S32_F32_I]]
//
int32_t test_vcvts_s32_f32(float32_t a) {
@@ -189,7 +189,7 @@ int32_t test_vcvts_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTD_S64_F64_I]]
//
int64_t test_vcvtd_s64_f64(float64_t a) {
@@ -199,7 +199,7 @@ int64_t test_vcvtd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTS_U32_F32_I]]
//
uint32_t test_vcvts_u32_f32(float32_t a) {
@@ -209,7 +209,7 @@ uint32_t test_vcvts_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTD_U64_F64_I]]
//
uint64_t test_vcvtd_u64_f64(float64_t a) {
diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c
index 3498fdab97638..7da21aaf7e9bd 100644
--- a/clang/test/CodeGen/aarch64-neon-fma.c
+++ b/clang/test/CodeGen/aarch64-neon-fma.c
@@ -292,7 +292,7 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
// CHECK-NEXT: ret <2 x double> [[TMP3]]
//
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
@@ -308,7 +308,7 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
// CHECK-NEXT: ret <2 x double> [[TMP3]]
//
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
diff --git a/clang/test/CodeGen/aarch64-neon-fp16fml.c b/clang/test/CodeGen/aarch64-neon-fp16fml.c
index 50bb629f7d477..fd4e91eb5181a 100644
--- a/clang/test/CodeGen/aarch64-neon-fp16fml.c
+++ b/clang/test/CodeGen/aarch64-neon-fp16fml.c
@@ -15,7 +15,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -27,7 +27,7 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -39,7 +39,7 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -51,7 +51,7 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -63,7 +63,7 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -75,7 +75,7 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -87,7 +87,7 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -99,7 +99,7 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -153,7 +153,7 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -205,7 +205,7 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -297,7 +297,7 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -389,7 +389,7 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -441,7 +441,7 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -493,7 +493,7 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -585,7 +585,7 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -677,7 +677,7 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -729,7 +729,7 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -781,7 +781,7 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@@ -873,7 +873,7 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -965,7 +965,7 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@@ -1017,7 +1017,7 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -1069,7 +1069,7 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@@ -1161,7 +1161,7 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@@ -1253,7 +1253,7 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
-// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
+// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
diff --git a/clang/test/CodeGen/aarch64-neon-tbl.c b/clang/test/CodeGen/aarch64-neon-tbl.c
index 71ac7d425c451..8d0ece0083cbe 100644
--- a/clang/test/CodeGen/aarch64-neon-tbl.c
+++ b/clang/test/CodeGen/aarch64-neon-tbl.c
@@ -10,7 +10,7 @@
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
@@ -20,7 +20,7 @@ int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
int8x8_t test_vqtbl1_s8(int8x16_t a, uint8x8_t b) {
@@ -45,7 +45,7 @@ int8x8_t test_vqtbl1_s8(int8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
@@ -69,7 +69,7 @@ int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
int8x8_t test_vqtbl2_s8(int8x16x2_t a, uint8x8_t b) {
@@ -98,7 +98,7 @@ int8x8_t test_vqtbl2_s8(int8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
@@ -125,7 +125,7 @@ int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
int8x8_t test_vqtbl3_s8(int8x16x3_t a, uint8x8_t b) {
@@ -157,7 +157,7 @@ int8x8_t test_vqtbl3_s8(int8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
@@ -187,7 +187,7 @@ int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
int8x8_t test_vqtbl4_s8(int8x16x4_t a, uint8x8_t b) {
@@ -197,7 +197,7 @@ int8x8_t test_vqtbl4_s8(int8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
@@ -221,7 +221,7 @@ int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
@@ -248,7 +248,7 @@ int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
@@ -278,7 +278,7 @@ int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
@@ -289,7 +289,7 @@ int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@@ -320,7 +320,7 @@ int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
@@ -349,7 +349,7 @@ int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@@ -387,7 +387,7 @@ int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
@@ -397,7 +397,7 @@ int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, uint8x8_t c) {
@@ -421,7 +421,7 @@ int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, uint8x8_t c) {
@@ -448,7 +448,7 @@ int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, uint8x8_t c) {
@@ -478,7 +478,7 @@ int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, uint8x8_t c) {
@@ -488,7 +488,7 @@ int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, uint8x16_t c) {
@@ -512,7 +512,7 @@ int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
@@ -539,7 +539,7 @@ int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
@@ -569,7 +569,7 @@ int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
@@ -580,7 +580,7 @@ int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
@@ -590,7 +590,7 @@ uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
@@ -615,7 +615,7 @@ uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
@@ -639,7 +639,7 @@ uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
@@ -668,7 +668,7 @@ uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
@@ -695,7 +695,7 @@ uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
@@ -727,7 +727,7 @@ uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
@@ -757,7 +757,7 @@ uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
@@ -767,7 +767,7 @@ uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
@@ -791,7 +791,7 @@ uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
@@ -818,7 +818,7 @@ uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
@@ -848,7 +848,7 @@ uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
@@ -859,7 +859,7 @@ uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@@ -890,7 +890,7 @@ uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
@@ -919,7 +919,7 @@ uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@@ -957,7 +957,7 @@ uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
@@ -967,7 +967,7 @@ uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
@@ -991,7 +991,7 @@ uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
@@ -1018,7 +1018,7 @@ uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
@@ -1048,7 +1048,7 @@ uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
@@ -1058,7 +1058,7 @@ uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
@@ -1082,7 +1082,7 @@ uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
@@ -1109,7 +1109,7 @@ uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
@@ -1139,7 +1139,7 @@ uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
@@ -1150,7 +1150,7 @@ uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
@@ -1160,7 +1160,7 @@ poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
@@ -1185,7 +1185,7 @@ poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
@@ -1209,7 +1209,7 @@ poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
@@ -1238,7 +1238,7 @@ poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
@@ -1265,7 +1265,7 @@ poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
@@ -1297,7 +1297,7 @@ poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
@@ -1327,7 +1327,7 @@ poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
@@ -1337,7 +1337,7 @@ poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
@@ -1361,7 +1361,7 @@ poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
@@ -1388,7 +1388,7 @@ poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
@@ -1418,7 +1418,7 @@ poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
@@ -1429,7 +1429,7 @@ poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@@ -1460,7 +1460,7 @@ poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
@@ -1489,7 +1489,7 @@ poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@@ -1527,7 +1527,7 @@ poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
@@ -1537,7 +1537,7 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_p8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
@@ -1561,7 +1561,7 @@ poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
@@ -1588,7 +1588,7 @@ poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
@@ -1618,7 +1618,7 @@ poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
@@ -1628,7 +1628,7 @@ poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
@@ -1652,7 +1652,7 @@ poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
@@ -1679,7 +1679,7 @@ poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
@@ -1709,7 +1709,7 @@ poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {
diff --git a/clang/test/CodeGen/aarch64-poly128.c b/clang/test/CodeGen/aarch64-poly128.c
index 2e58c0f286e6f..c01fa24d697e9 100644
--- a/clang/test/CodeGen/aarch64-poly128.c
+++ b/clang/test/CodeGen/aarch64-poly128.c
@@ -60,7 +60,7 @@ void test_ld_st_p128(poly128_t * ptr) {
// CHECK-LABEL: define {{[^@]+}}@test_vmull_p64
// CHECK-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A]], i64 [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A]], i64 [[B]])
// CHECK-NEXT: [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
// CHECK-NEXT: ret i128 [[VMULL_P641_I]]
//
@@ -75,7 +75,7 @@ poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> [[B]], <1 x i32> <i32 1>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to i64
-// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]])
// CHECK-NEXT: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
// CHECK-NEXT: ret i128 [[VMULL_P641_I_I]]
//
diff --git a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
index 418bbf19d84f4..9fce4c7a2a3ea 100644
--- a/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.1a-neon-intrinsics.c
@@ -11,7 +11,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
@@ -23,7 +23,7 @@ int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
@@ -35,7 +35,7 @@ int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
@@ -47,7 +47,7 @@ int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
@@ -59,7 +59,7 @@ int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
-// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -69,7 +69,7 @@ int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) {
// CHECK-LABEL: @test_vqrdmlahs_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
@@ -82,7 +82,7 @@ int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -93,7 +93,7 @@ int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlahs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
-// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
@@ -106,7 +106,7 @@ int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -117,7 +117,7 @@ int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlahs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
-// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
@@ -129,7 +129,7 @@ int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
@@ -141,7 +141,7 @@ int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
@@ -153,7 +153,7 @@ int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
@@ -165,7 +165,7 @@ int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
@@ -177,7 +177,7 @@ int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
-// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -187,7 +187,7 @@ int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) {
// CHECK-LABEL: @test_vqrdmlshs_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
@@ -200,7 +200,7 @@ int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -211,7 +211,7 @@ int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
-// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
@@ -224,7 +224,7 @@ int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@@ -235,7 +235,7 @@ int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
-// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 3b378527f9cef..08e7fecd1330f 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -12,7 +12,7 @@
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
//
float16x4_t test_vabs_f16(float16x4_t a) {
@@ -23,7 +23,7 @@ float16x4_t test_vabs_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
//
float16x8_t test_vabsq_f16(float16x8_t a) {
@@ -198,7 +198,7 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
@@ -209,7 +209,7 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
@@ -220,7 +220,7 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
@@ -231,7 +231,7 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
@@ -242,7 +242,7 @@ uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
@@ -253,7 +253,7 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
@@ -264,7 +264,7 @@ uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTA1_I]]
//
int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
@@ -275,7 +275,7 @@ int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
@@ -286,7 +286,7 @@ int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
@@ -297,7 +297,7 @@ int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
@@ -308,7 +308,7 @@ uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
@@ -319,7 +319,7 @@ uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
@@ -330,7 +330,7 @@ int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
@@ -341,7 +341,7 @@ int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
@@ -352,7 +352,7 @@ uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
@@ -363,7 +363,7 @@ uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
@@ -374,7 +374,7 @@ int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
@@ -385,7 +385,7 @@ int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
@@ -396,7 +396,7 @@ uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
@@ -428,7 +428,7 @@ float16x8_t test_vnegq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
//
float16x4_t test_vrecpe_f16(float16x4_t a) {
@@ -439,7 +439,7 @@ float16x4_t test_vrecpe_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
//
float16x8_t test_vrecpeq_f16(float16x8_t a) {
@@ -450,7 +450,7 @@ float16x8_t test_vrecpeq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDZ1_I]]
//
float16x4_t test_vrnd_f16(float16x4_t a) {
@@ -461,7 +461,7 @@ float16x4_t test_vrnd_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDZ1_I]]
//
float16x8_t test_vrndq_f16(float16x8_t a) {
@@ -472,7 +472,7 @@ float16x8_t test_vrndq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDA1_I]]
//
float16x4_t test_vrnda_f16(float16x4_t a) {
@@ -483,7 +483,7 @@ float16x4_t test_vrnda_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDA1_I]]
//
float16x8_t test_vrndaq_f16(float16x8_t a) {
@@ -494,7 +494,7 @@ float16x8_t test_vrndaq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDI_V1_I]]
//
float16x4_t test_vrndi_f16(float16x4_t a) {
@@ -505,7 +505,7 @@ float16x4_t test_vrndi_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDIQ_V1_I]]
//
float16x8_t test_vrndiq_f16(float16x8_t a) {
@@ -516,7 +516,7 @@ float16x8_t test_vrndiq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDM1_I]]
//
float16x4_t test_vrndm_f16(float16x4_t a) {
@@ -527,7 +527,7 @@ float16x4_t test_vrndm_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDM1_I]]
//
float16x8_t test_vrndmq_f16(float16x8_t a) {
@@ -538,7 +538,7 @@ float16x8_t test_vrndmq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDN1_I]]
//
float16x4_t test_vrndn_f16(float16x4_t a) {
@@ -549,7 +549,7 @@ float16x4_t test_vrndn_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDN1_I]]
//
float16x8_t test_vrndnq_f16(float16x8_t a) {
@@ -560,7 +560,7 @@ float16x8_t test_vrndnq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDP1_I]]
//
float16x4_t test_vrndp_f16(float16x4_t a) {
@@ -571,7 +571,7 @@ float16x4_t test_vrndp_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDP1_I]]
//
float16x8_t test_vrndpq_f16(float16x8_t a) {
@@ -582,7 +582,7 @@ float16x8_t test_vrndpq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDX1_I]]
//
float16x4_t test_vrndx_f16(float16x4_t a) {
@@ -593,7 +593,7 @@ float16x4_t test_vrndx_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDX1_I]]
//
float16x8_t test_vrndxq_f16(float16x8_t a) {
@@ -604,7 +604,7 @@ float16x8_t test_vrndxq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
//
float16x4_t test_vrsqrte_f16(float16x4_t a) {
@@ -615,7 +615,7 @@ float16x4_t test_vrsqrte_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
//
float16x8_t test_vrsqrteq_f16(float16x8_t a) {
@@ -626,7 +626,7 @@ float16x8_t test_vrsqrteq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VSQRT_I]]
//
float16x4_t test_vsqrt_f16(float16x4_t a) {
@@ -637,7 +637,7 @@ float16x4_t test_vsqrt_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VSQRT_I]]
//
float16x8_t test_vsqrtq_f16(float16x8_t a) {
@@ -669,7 +669,7 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VABD2_I]]
//
float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
@@ -681,7 +681,7 @@ float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VABD2_I]]
//
float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
@@ -693,7 +693,7 @@ float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]]
//
uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
@@ -705,7 +705,7 @@ uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]]
//
uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
@@ -717,7 +717,7 @@ uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]]
//
uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
@@ -729,7 +729,7 @@ uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]]
//
uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
@@ -741,7 +741,7 @@ uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]]
//
uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
@@ -753,7 +753,7 @@ uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]]
//
uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
@@ -765,7 +765,7 @@ uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]]
//
uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
@@ -777,7 +777,7 @@ uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]]
//
uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) {
@@ -1015,7 +1015,7 @@ float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAX2_I]]
//
float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
@@ -1027,7 +1027,7 @@ float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAX2_I]]
//
float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
@@ -1039,7 +1039,7 @@ float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAXNM2_I]]
//
float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
@@ -1051,7 +1051,7 @@ float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAXNM2_I]]
//
float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
@@ -1063,7 +1063,7 @@ float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMIN2_I]]
//
float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
@@ -1075,7 +1075,7 @@ float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMIN2_I]]
//
float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
@@ -1087,7 +1087,7 @@ float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMINNM2_I]]
//
float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
@@ -1099,7 +1099,7 @@ float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMINNM2_I]]
//
float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) {
@@ -1131,7 +1131,7 @@ float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
@@ -1143,7 +1143,7 @@ float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
@@ -1155,7 +1155,7 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]]
//
@@ -1168,7 +1168,7 @@ float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VPADDQ_V2_I]]
//
@@ -1181,7 +1181,7 @@ float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAX2_I]]
//
float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
@@ -1193,7 +1193,7 @@ float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAX2_I]]
//
float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
@@ -1205,7 +1205,7 @@ float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAXNM2_I]]
//
float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
@@ -1217,7 +1217,7 @@ float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAXNM2_I]]
//
float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
@@ -1229,7 +1229,7 @@ float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMIN2_I]]
//
float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
@@ -1241,7 +1241,7 @@ float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMIN2_I]]
//
float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
@@ -1253,7 +1253,7 @@ float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMINNM2_I]]
//
float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
@@ -1265,7 +1265,7 @@ float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMINNM2_I]]
//
float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
@@ -1277,7 +1277,7 @@ float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]]
//
@@ -1290,7 +1290,7 @@ float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]]
//
@@ -1303,7 +1303,7 @@ float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]]
//
@@ -1316,7 +1316,7 @@ float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]]
//
@@ -1350,7 +1350,7 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1363,7 +1363,7 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
@@ -1377,7 +1377,7 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@@ -1391,7 +1391,7 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
@@ -1476,7 +1476,7 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@@ -1497,7 +1497,7 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@@ -1609,7 +1609,7 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@@ -1631,7 +1631,7 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@@ -1803,7 +1803,7 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
@@ -1818,7 +1818,7 @@ float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
@@ -1833,7 +1833,7 @@ float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
@@ -1848,7 +1848,7 @@ float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
@@ -1864,7 +1864,7 @@ float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
@@ -1884,7 +1884,7 @@ float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
-// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]]) #[[ATTR4]]
+// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
index d40504e3978cf..9280a2b986f9e 100644
--- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c
@@ -262,7 +262,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
@@ -270,7 +270,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
@@ -281,7 +281,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], <4 x bfloat>* [[RETVAL_I1]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I1]] to <2 x i32>*
@@ -307,14 +307,14 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
@@ -332,7 +332,7 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], <4 x bfloat>* [[RETVAL_I1]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I1]] to <2 x i32>*
@@ -378,14 +378,14 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V3_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
@@ -420,7 +420,7 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 8
// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[__P0_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
-// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], <4 x bfloat>* [[RETVAL_I8]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I8]] to <2 x i32>*
@@ -477,17 +477,17 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
-// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]]) #[[ATTR3]]
+// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
-// CHECK-A32-HARDFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]]) #[[ATTR3]]
+// CHECK-A32-HARDFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
// CHECK-A32-HARDFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
-// CHECK-A32-SOFTFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]]) #[[ATTR3]]
+// CHECK-A32-SOFTFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
// CHECK-A32-SOFTFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
//
bfloat16_t test_vcvth_bf16_f32(float32_t a) {
diff --git a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
index fbecb0dadac2e..a190081ec90dd 100644
--- a/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
+++ b/clang/test/CodeGen/arm-bf16-dotprod-intrinsics.c
@@ -17,7 +17,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -29,7 +29,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -52,7 +52,7 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -75,7 +75,7 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -98,7 +98,7 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -121,7 +121,7 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
-// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -133,7 +133,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMMLAQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
@@ -146,7 +146,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -159,7 +159,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@@ -188,7 +188,7 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -217,7 +217,7 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -246,7 +246,7 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@@ -275,7 +275,7 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
-// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
+// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
diff --git a/clang/test/CodeGen/arm-neon-fma.c b/clang/test/CodeGen/arm-neon-fma.c
index cd534f9cd1879..03ab0dbc7fdb4 100644
--- a/clang/test/CodeGen/arm-neon-fma.c
+++ b/clang/test/CodeGen/arm-neon-fma.c
@@ -16,7 +16,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
@@ -29,7 +29,7 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]]) #[[ATTR3]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
@@ -44,7 +44,7 @@ float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@@ -61,7 +61,7 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
diff --git a/clang/test/CodeGen/arm-neon-numeric-maxmin.c b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
index 849f5adebeda4..b53c0b8126458 100644
--- a/clang/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -10,7 +10,7 @@
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMAXNM_V2_I]]
//
@@ -23,7 +23,7 @@ float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMAXNMQ_V2_I]]
//
@@ -36,7 +36,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
-// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMINNM_V2_I]]
//
@@ -49,7 +49,7 @@ float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
-// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMINNMQ_V2_I]]
//
diff --git a/clang/test/CodeGen/arm-neon-vcvtX.c b/clang/test/CodeGen/arm-neon-vcvtX.c
index 71f8e5bca79da..95a6e2dff57a9 100644
--- a/clang/test/CodeGen/arm-neon-vcvtX.c
+++ b/clang/test/CodeGen/arm-neon-vcvtX.c
@@ -9,7 +9,7 @@
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_S32_V1_I]]
//
int32x2_t test_vcvta_s32_f32(float32x2_t a) {
@@ -20,7 +20,7 @@ int32x2_t test_vcvta_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_U32_V1_I]]
//
uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
@@ -31,7 +31,7 @@ uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_S32_V1_I]]
//
int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
@@ -42,7 +42,7 @@ int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_U32_V1_I]]
//
uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
@@ -53,7 +53,7 @@ uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_S32_V1_I]]
//
int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
@@ -64,7 +64,7 @@ int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_U32_V1_I]]
//
uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
@@ -75,7 +75,7 @@ uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_S32_V1_I]]
//
int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
@@ -86,7 +86,7 @@ int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_U32_V1_I]]
//
uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
@@ -97,7 +97,7 @@ uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_S32_V1_I]]
//
int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
@@ -108,7 +108,7 @@ int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_U32_V1_I]]
//
uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
@@ -119,7 +119,7 @@ uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_S32_V1_I]]
//
int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
@@ -130,7 +130,7 @@ int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_U32_V1_I]]
//
uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
@@ -141,7 +141,7 @@ uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_S32_V1_I]]
//
int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
@@ -152,7 +152,7 @@ int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
-// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_U32_V1_I]]
//
uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
@@ -163,7 +163,7 @@ uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_S32_V1_I]]
//
int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
@@ -174,7 +174,7 @@ int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
-// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_U32_V1_I]]
//
uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
index dc007927be1c1..e516a6d16f54f 100644
--- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
+++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c
@@ -13,12 +13,12 @@
// CHECK-ARM-LABEL: @test_vqrdmlah_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -28,12 +28,12 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -43,12 +43,12 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@@ -58,12 +58,12 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@@ -76,7 +76,7 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
@@ -84,7 +84,7 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -97,7 +97,7 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
@@ -105,7 +105,7 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -118,7 +118,7 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
@@ -126,7 +126,7 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -139,7 +139,7 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32(
@@ -147,7 +147,7 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
@@ -157,12 +157,12 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -172,12 +172,12 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -187,12 +187,12 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s16(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@@ -202,12 +202,12 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s32(
// CHECK-ARM-NEXT: entry:
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32(
// CHECK-AARCH64-NEXT: entry:
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@@ -220,7 +220,7 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16(
@@ -228,7 +228,7 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@@ -241,7 +241,7 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
@@ -249,7 +249,7 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@@ -262,7 +262,7 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
@@ -270,7 +270,7 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@@ -283,7 +283,7 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
@@ -291,7 +291,7 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
+// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index 7fbaf36aec3dc..05515b2741753 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -56,12 +56,12 @@ void test_isb(void) {
/* 8.4 Hints */
// AArch32-LABEL: @test_yield(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 1) [[ATTR1:#.*]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 1)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_yield(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 1) [[ATTR3:#.*]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 1)
// AArch64-NEXT: ret void
//
void test_yield(void) {
@@ -70,12 +70,12 @@ void test_yield(void) {
// AArch32-LABEL: @test_wfe(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 2) [[ATTR1]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 2)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wfe(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 2) [[ATTR3]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 2)
// AArch64-NEXT: ret void
//
void test_wfe(void) {
@@ -84,12 +84,12 @@ void test_wfe(void) {
// AArch32-LABEL: @test_wfi(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 3) [[ATTR1]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 3)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wfi(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 3) [[ATTR3]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 3)
// AArch64-NEXT: ret void
//
void test_wfi(void) {
@@ -98,12 +98,12 @@ void test_wfi(void) {
// AArch32-LABEL: @test_sev(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 4) [[ATTR1]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 4)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_sev(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 4) [[ATTR3]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 4)
// AArch64-NEXT: ret void
//
void test_sev(void) {
@@ -112,12 +112,12 @@ void test_sev(void) {
// AArch32-LABEL: @test_sevl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 5) [[ATTR1]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 5)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_sevl(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 5) [[ATTR3]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 5)
// AArch64-NEXT: ret void
//
void test_sevl(void) {
@@ -141,10 +141,10 @@ void test_dbg(void) {
// AArch32-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32*
// AArch32-NEXT: br label [[DO_BODY_I:%.*]]
// AArch32: do.body.i:
-// AArch32-NEXT: [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT: [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT: [[LDREX_I:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* elementtype(i32) [[TMP0]])
+// AArch32-NEXT: [[STREX_I:%.*]] = call i32 @llvm.arm.strex.p0i32(i32 [[X:%.*]], i32* elementtype(i32) [[TMP0]])
// AArch32-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[STREX_I]], 0
-// AArch32-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP3:!llvm.loop !.*]]
+// AArch32-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]]
// AArch32: __swp.exit:
// AArch32-NEXT: ret void
//
@@ -153,12 +153,12 @@ void test_dbg(void) {
// AArch64-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to i32*
// AArch64-NEXT: br label [[DO_BODY_I:%.*]]
// AArch64: do.body.i:
-// AArch64-NEXT: [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[LDXR_I:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* elementtype(i32) [[TMP0]])
// AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[LDXR_I]] to i32
// AArch64-NEXT: [[TMP2:%.*]] = zext i32 [[X:%.*]] to i64
-// AArch64-NEXT: [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[STXR_I:%.*]] = call i32 @llvm.aarch64.stxr.p0i32(i64 [[TMP2]], i32* elementtype(i32) [[TMP0]])
// AArch64-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[STXR_I]], 0
-// AArch64-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], [[LOOP6:!llvm.loop !.*]]
+// AArch64-NEXT: br i1 [[TOBOOL_I]], label [[DO_BODY_I]], label [[__SWP_EXIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]]
// AArch64: __swp.exit:
// AArch64-NEXT: ret void
//
@@ -218,12 +218,12 @@ void test_plix() {
/* 8.7 NOP */
// AArch32-LABEL: @test_nop(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.arm.hint(i32 0) [[ATTR1]]
+// AArch32-NEXT: call void @llvm.arm.hint(i32 0)
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_nop(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.aarch64.hint(i32 0) [[ATTR3]]
+// AArch64-NEXT: call void @llvm.aarch64.hint(i32 0)
// AArch64-NEXT: ret void
//
void test_nop(void) {
@@ -317,15 +317,10 @@ uint64_t test_rorll(uint64_t x, uint32_t y) {
return __rorll(x, y);
}
-// AArch32-LABEL: @test_clz(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]]
-// AArch32-NEXT: ret i32 [[TMP0]]
-//
-// AArch64-LABEL: @test_clz(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR3]]
-// AArch64-NEXT: ret i32 [[TMP0]]
+// ARM-LABEL: @test_clz(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false)
+// ARM-NEXT: ret i32 [[TMP0]]
//
uint32_t test_clz(uint32_t t) {
return __clz(t);
@@ -333,12 +328,12 @@ uint32_t test_clz(uint32_t t) {
// AArch32-LABEL: @test_clzl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[T:%.*]], i1 false)
// AArch32-NEXT: ret i32 [[TMP0]]
//
// AArch64-LABEL: @test_clzl(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false)
// AArch64-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
// AArch64-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
// AArch64-NEXT: ret i64 [[CONV_I]]
@@ -347,19 +342,12 @@ long test_clzl(long t) {
return __clzl(t);
}
-// AArch32-LABEL: @test_clzll(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR1]]
-// AArch32-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
-// AArch32-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
-// AArch32-NEXT: ret i64 [[CONV_I]]
-//
-// AArch64-LABEL: @test_clzll(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false) [[ATTR3]]
-// AArch64-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
-// AArch64-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
-// AArch64-NEXT: ret i64 [[CONV_I]]
+// ARM-LABEL: @test_clzll(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[T:%.*]], i1 false)
+// ARM-NEXT: [[CAST_I:%.*]] = trunc i64 [[TMP0]] to i32
+// ARM-NEXT: [[CONV_I:%.*]] = sext i32 [[CAST_I]] to i64
+// ARM-NEXT: ret i64 [[CONV_I]]
//
uint64_t test_clzll(uint64_t t) {
return __clzll(t);
@@ -367,12 +355,12 @@ uint64_t test_clzll(uint64_t t) {
// AArch32-LABEL: @test_cls(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]])
// AArch32-NEXT: ret i32 [[CLS_I]]
//
// AArch64-LABEL: @test_cls(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls(i32 [[T:%.*]])
// AArch64-NEXT: ret i32 [[CLS_I]]
//
unsigned test_cls(uint32_t t) {
@@ -381,12 +369,12 @@ unsigned test_cls(uint32_t t) {
// AArch32-LABEL: @test_clsl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls(i32 [[T:%.*]])
// AArch32-NEXT: ret i32 [[CLS_I]]
//
// AArch64-LABEL: @test_clsl(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]])
// AArch64-NEXT: ret i32 [[CLS_I]]
//
unsigned test_clsl(unsigned long t) {
@@ -395,27 +383,22 @@ unsigned test_clsl(unsigned long t) {
// AArch32-LABEL: @test_clsll(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[CLS_I:%.*]] = call i32 @llvm.arm.cls64(i64 [[T:%.*]])
// AArch32-NEXT: ret i32 [[CLS_I]]
//
// AArch64-LABEL: @test_clsll(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[CLS_I:%.*]] = call i32 @llvm.aarch64.cls64(i64 [[T:%.*]])
// AArch64-NEXT: ret i32 [[CLS_I]]
//
unsigned test_clsll(uint64_t t) {
return __clsll(t);
}
-// AArch32-LABEL: @test_rev(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
-// AArch32-NEXT: ret i32 [[TMP0]]
-//
-// AArch64-LABEL: @test_rev(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]]
-// AArch64-NEXT: ret i32 [[TMP0]]
+// ARM-LABEL: @test_rev(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]])
+// ARM-NEXT: ret i32 [[TMP0]]
//
uint32_t test_rev(uint32_t t) {
return __rev(t);
@@ -423,67 +406,44 @@ uint32_t test_rev(uint32_t t) {
// AArch32-LABEL: @test_revl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
// AArch64-LABEL: @test_revl(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]])
// AArch64-NEXT: ret i64 [[TMP0]]
//
long test_revl(long t) {
return __revl(t);
}
-// AArch32-LABEL: @test_revll(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR1]]
-// AArch32-NEXT: ret i64 [[TMP0]]
-//
-// AArch64-LABEL: @test_revll(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]]) [[ATTR3]]
-// AArch64-NEXT: ret i64 [[TMP0]]
+// ARM-LABEL: @test_revll(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i64 @llvm.bswap.i64(i64 [[T:%.*]])
+// ARM-NEXT: ret i64 [[TMP0]]
//
uint64_t test_revll(uint64_t t) {
return __revll(t);
}
-// AArch32-LABEL: @test_rev16(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
-// AArch32-NEXT: [[REM_I_I:%.*]] = urem i32 16, 32
-// AArch32-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0
-// AArch32-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]]
-// AArch32: if.then.i.i:
-// AArch32-NEXT: br label [[__REV16_EXIT:%.*]]
-// AArch32: if.end.i.i:
-// AArch32-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I]]
-// AArch32-NEXT: [[SUB_I_I:%.*]] = sub i32 32, [[REM_I_I]]
-// AArch32-NEXT: [[SHL_I_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I]]
-// AArch32-NEXT: [[OR_I_I:%.*]] = or i32 [[SHR_I_I]], [[SHL_I_I]]
-// AArch32-NEXT: br label [[__REV16_EXIT]]
-// AArch32: __rev16.exit:
-// AArch32-NEXT: [[RETVAL_I_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I]] ], [ [[OR_I_I]], [[IF_END_I_I]] ]
-// AArch32-NEXT: ret i32 [[RETVAL_I_I_0]]
-//
-// AArch64-LABEL: @test_rev16(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR3]]
-// AArch64-NEXT: [[REM_I_I:%.*]] = urem i32 16, 32
-// AArch64-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0
-// AArch64-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]]
-// AArch64: if.then.i.i:
-// AArch64-NEXT: br label [[__REV16_EXIT:%.*]]
-// AArch64: if.end.i.i:
-// AArch64-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I]]
-// AArch64-NEXT: [[SUB_I_I:%.*]] = sub i32 32, [[REM_I_I]]
-// AArch64-NEXT: [[SHL_I_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I]]
-// AArch64-NEXT: [[OR_I_I:%.*]] = or i32 [[SHR_I_I]], [[SHL_I_I]]
-// AArch64-NEXT: br label [[__REV16_EXIT]]
-// AArch64: __rev16.exit:
-// AArch64-NEXT: [[RETVAL_I_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I]] ], [ [[OR_I_I]], [[IF_END_I_I]] ]
-// AArch64-NEXT: ret i32 [[RETVAL_I_I_0]]
+// ARM-LABEL: @test_rev16(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]])
+// ARM-NEXT: [[REM_I_I:%.*]] = urem i32 16, 32
+// ARM-NEXT: [[CMP_I_I:%.*]] = icmp eq i32 [[REM_I_I]], 0
+// ARM-NEXT: br i1 [[CMP_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]]
+// ARM: if.then.i.i:
+// ARM-NEXT: br label [[__REV16_EXIT:%.*]]
+// ARM: if.end.i.i:
+// ARM-NEXT: [[SHR_I_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I]]
+// ARM-NEXT: [[SUB_I_I:%.*]] = sub i32 32, [[REM_I_I]]
+// ARM-NEXT: [[SHL_I_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I]]
+// ARM-NEXT: [[OR_I_I:%.*]] = or i32 [[SHR_I_I]], [[SHL_I_I]]
+// ARM-NEXT: br label [[__REV16_EXIT]]
+// ARM: __rev16.exit:
+// ARM-NEXT: [[RETVAL_I_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I]] ], [ [[OR_I_I]], [[IF_END_I_I]] ]
+// ARM-NEXT: ret i32 [[RETVAL_I_I_0]]
//
uint32_t test_rev16(uint32_t t) {
return __rev16(t);
@@ -491,7 +451,7 @@ uint32_t test_rev16(uint32_t t) {
// AArch32-LABEL: @test_rev16l(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[T:%.*]])
// AArch32-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32
// AArch32-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
// AArch32-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -511,7 +471,7 @@ uint32_t test_rev16(uint32_t t) {
// AArch64-NEXT: entry:
// AArch64-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
// AArch64-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]])
// AArch64-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32
// AArch64-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
// AArch64-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
@@ -528,7 +488,7 @@ uint32_t test_rev16(uint32_t t) {
// AArch64-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
// AArch64-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
// AArch64-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]])
// AArch64-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32
// AArch64-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
// AArch64-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
@@ -550,111 +510,62 @@ long test_rev16l(long t) {
return __rev16l(t);
}
-// AArch32-LABEL: @test_rev16ll(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
-// AArch32-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR1]]
-// AArch32-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32
-// AArch32-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
-// AArch32-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
-// AArch32: if.then.i.i12.i:
-// AArch32-NEXT: br label [[__REV16_EXIT18_I:%.*]]
-// AArch32: if.end.i.i17.i:
-// AArch32-NEXT: [[SHR_I_I13_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I10_I]]
-// AArch32-NEXT: [[SUB_I_I14_I:%.*]] = sub i32 32, [[REM_I_I10_I]]
-// AArch32-NEXT: [[SHL_I_I15_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I14_I]]
-// AArch32-NEXT: [[OR_I_I16_I:%.*]] = or i32 [[SHR_I_I13_I]], [[SHL_I_I15_I]]
-// AArch32-NEXT: br label [[__REV16_EXIT18_I]]
-// AArch32: __rev16.exit18.i:
-// AArch32-NEXT: [[RETVAL_I_I6_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I12_I]] ], [ [[OR_I_I16_I]], [[IF_END_I_I17_I]] ]
-// AArch32-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
-// AArch32-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
-// AArch32-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR1]]
-// AArch32-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32
-// AArch32-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
-// AArch32-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
-// AArch32: if.then.i.i.i:
-// AArch32-NEXT: br label [[__REV16LL_EXIT:%.*]]
-// AArch32: if.end.i.i.i:
-// AArch32-NEXT: [[SHR_I_I_I:%.*]] = lshr i32 [[TMP1]], [[REM_I_I_I]]
-// AArch32-NEXT: [[SUB_I_I_I:%.*]] = sub i32 32, [[REM_I_I_I]]
-// AArch32-NEXT: [[SHL_I_I_I:%.*]] = shl i32 [[TMP1]], [[SUB_I_I_I]]
-// AArch32-NEXT: [[OR_I_I_I:%.*]] = or i32 [[SHR_I_I_I]], [[SHL_I_I_I]]
-// AArch32-NEXT: br label [[__REV16LL_EXIT]]
-// AArch32: __rev16ll.exit:
-// AArch32-NEXT: [[RETVAL_I_I_I_0:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN_I_I_I]] ], [ [[OR_I_I_I]], [[IF_END_I_I_I]] ]
-// AArch32-NEXT: [[CONV4_I:%.*]] = zext i32 [[RETVAL_I_I_I_0]] to i64
-// AArch32-NEXT: [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]]
-// AArch32-NEXT: ret i64 [[OR_I]]
-//
-// AArch64-LABEL: @test_rev16ll(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
-// AArch64-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]]) [[ATTR3]]
-// AArch64-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32
-// AArch64-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
-// AArch64-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
-// AArch64: if.then.i.i12.i:
-// AArch64-NEXT: br label [[__REV16_EXIT18_I:%.*]]
-// AArch64: if.end.i.i17.i:
-// AArch64-NEXT: [[SHR_I_I13_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I10_I]]
-// AArch64-NEXT: [[SUB_I_I14_I:%.*]] = sub i32 32, [[REM_I_I10_I]]
-// AArch64-NEXT: [[SHL_I_I15_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I14_I]]
-// AArch64-NEXT: [[OR_I_I16_I:%.*]] = or i32 [[SHR_I_I13_I]], [[SHL_I_I15_I]]
-// AArch64-NEXT: br label [[__REV16_EXIT18_I]]
-// AArch64: __rev16.exit18.i:
-// AArch64-NEXT: [[RETVAL_I_I6_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I12_I]] ], [ [[OR_I_I16_I]], [[IF_END_I_I17_I]] ]
-// AArch64-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
-// AArch64-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
-// AArch64-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]]) [[ATTR3]]
-// AArch64-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32
-// AArch64-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
-// AArch64-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
-// AArch64: if.then.i.i.i:
-// AArch64-NEXT: br label [[__REV16LL_EXIT:%.*]]
-// AArch64: if.end.i.i.i:
-// AArch64-NEXT: [[SHR_I_I_I:%.*]] = lshr i32 [[TMP1]], [[REM_I_I_I]]
-// AArch64-NEXT: [[SUB_I_I_I:%.*]] = sub i32 32, [[REM_I_I_I]]
-// AArch64-NEXT: [[SHL_I_I_I:%.*]] = shl i32 [[TMP1]], [[SUB_I_I_I]]
-// AArch64-NEXT: [[OR_I_I_I:%.*]] = or i32 [[SHR_I_I_I]], [[SHL_I_I_I]]
-// AArch64-NEXT: br label [[__REV16LL_EXIT]]
-// AArch64: __rev16ll.exit:
-// AArch64-NEXT: [[RETVAL_I_I_I_0:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN_I_I_I]] ], [ [[OR_I_I_I]], [[IF_END_I_I_I]] ]
-// AArch64-NEXT: [[CONV4_I:%.*]] = zext i32 [[RETVAL_I_I_I_0]] to i64
-// AArch64-NEXT: [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]]
-// AArch64-NEXT: ret i64 [[OR_I]]
+// ARM-LABEL: @test_rev16ll(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[SHR_I:%.*]] = lshr i64 [[T:%.*]], 32
+// ARM-NEXT: [[CONV_I:%.*]] = trunc i64 [[SHR_I]] to i32
+// ARM-NEXT: [[TMP0:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV_I]])
+// ARM-NEXT: [[REM_I_I10_I:%.*]] = urem i32 16, 32
+// ARM-NEXT: [[CMP_I_I11_I:%.*]] = icmp eq i32 [[REM_I_I10_I]], 0
+// ARM-NEXT: br i1 [[CMP_I_I11_I]], label [[IF_THEN_I_I12_I:%.*]], label [[IF_END_I_I17_I:%.*]]
+// ARM: if.then.i.i12.i:
+// ARM-NEXT: br label [[__REV16_EXIT18_I:%.*]]
+// ARM: if.end.i.i17.i:
+// ARM-NEXT: [[SHR_I_I13_I:%.*]] = lshr i32 [[TMP0]], [[REM_I_I10_I]]
+// ARM-NEXT: [[SUB_I_I14_I:%.*]] = sub i32 32, [[REM_I_I10_I]]
+// ARM-NEXT: [[SHL_I_I15_I:%.*]] = shl i32 [[TMP0]], [[SUB_I_I14_I]]
+// ARM-NEXT: [[OR_I_I16_I:%.*]] = or i32 [[SHR_I_I13_I]], [[SHL_I_I15_I]]
+// ARM-NEXT: br label [[__REV16_EXIT18_I]]
+// ARM: __rev16.exit18.i:
+// ARM-NEXT: [[RETVAL_I_I6_I_0:%.*]] = phi i32 [ [[TMP0]], [[IF_THEN_I_I12_I]] ], [ [[OR_I_I16_I]], [[IF_END_I_I17_I]] ]
+// ARM-NEXT: [[CONV1_I:%.*]] = zext i32 [[RETVAL_I_I6_I_0]] to i64
+// ARM-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
+// ARM-NEXT: [[CONV2_I:%.*]] = trunc i64 [[T]] to i32
+// ARM-NEXT: [[TMP1:%.*]] = call i32 @llvm.bswap.i32(i32 [[CONV2_I]])
+// ARM-NEXT: [[REM_I_I_I:%.*]] = urem i32 16, 32
+// ARM-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i32 [[REM_I_I_I]], 0
+// ARM-NEXT: br i1 [[CMP_I_I_I]], label [[IF_THEN_I_I_I:%.*]], label [[IF_END_I_I_I:%.*]]
+// ARM: if.then.i.i.i:
+// ARM-NEXT: br label [[__REV16LL_EXIT:%.*]]
+// ARM: if.end.i.i.i:
+// ARM-NEXT: [[SHR_I_I_I:%.*]] = lshr i32 [[TMP1]], [[REM_I_I_I]]
+// ARM-NEXT: [[SUB_I_I_I:%.*]] = sub i32 32, [[REM_I_I_I]]
+// ARM-NEXT: [[SHL_I_I_I:%.*]] = shl i32 [[TMP1]], [[SUB_I_I_I]]
+// ARM-NEXT: [[OR_I_I_I:%.*]] = or i32 [[SHR_I_I_I]], [[SHL_I_I_I]]
+// ARM-NEXT: br label [[__REV16LL_EXIT]]
+// ARM: __rev16ll.exit:
+// ARM-NEXT: [[RETVAL_I_I_I_0:%.*]] = phi i32 [ [[TMP1]], [[IF_THEN_I_I_I]] ], [ [[OR_I_I_I]], [[IF_END_I_I_I]] ]
+// ARM-NEXT: [[CONV4_I:%.*]] = zext i32 [[RETVAL_I_I_I_0]] to i64
+// ARM-NEXT: [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]]
+// ARM-NEXT: ret i64 [[OR_I]]
//
uint64_t test_rev16ll(uint64_t t) {
return __rev16ll(t);
}
-// AArch32-LABEL: @test_revsh(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR1]]
-// AArch32-NEXT: ret i16 [[TMP0]]
-//
-// AArch64-LABEL: @test_revsh(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]]) [[ATTR3]]
-// AArch64-NEXT: ret i16 [[TMP0]]
+// ARM-LABEL: @test_revsh(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[TMP0:%.*]] = call i16 @llvm.bswap.i16(i16 [[T:%.*]])
+// ARM-NEXT: ret i16 [[TMP0]]
//
int16_t test_revsh(int16_t t) {
return __revsh(t);
}
-// AArch32-LABEL: @test_rbit(
-// AArch32-NEXT: entry:
-// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]]
-// AArch32-NEXT: ret i32 [[RBIT_I]]
-//
-// AArch64-LABEL: @test_rbit(
-// AArch64-NEXT: entry:
-// AArch64-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR3]]
-// AArch64-NEXT: ret i32 [[RBIT_I]]
+// ARM-LABEL: @test_rbit(
+// ARM-NEXT: entry:
+// ARM-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]])
+// ARM-NEXT: ret i32 [[RBIT_I]]
//
uint32_t test_rbit(uint32_t t) {
return __rbit(t);
@@ -662,12 +573,12 @@ uint32_t test_rbit(uint32_t t) {
// AArch32-LABEL: @test_rbitl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[RBIT_I_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[T:%.*]])
// AArch32-NEXT: ret i32 [[RBIT_I_I]]
//
// AArch64-LABEL: @test_rbitl(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]])
// AArch64-NEXT: ret i64 [[RBIT_I]]
//
long test_rbitl(long t) {
@@ -677,19 +588,19 @@ long test_rbitl(long t) {
// AArch32-LABEL: @test_rbitll(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[CONV_I:%.*]] = trunc i64 [[T:%.*]] to i32
-// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]]) [[ATTR1]]
+// AArch32-NEXT: [[RBIT_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV_I]])
// AArch32-NEXT: [[CONV1_I:%.*]] = zext i32 [[RBIT_I]] to i64
// AArch32-NEXT: [[SHL_I:%.*]] = shl i64 [[CONV1_I]], 32
// AArch32-NEXT: [[SHR_I:%.*]] = lshr i64 [[T]], 32
// AArch32-NEXT: [[CONV2_I:%.*]] = trunc i64 [[SHR_I]] to i32
-// AArch32-NEXT: [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]]) [[ATTR1]]
+// AArch32-NEXT: [[RBIT3_I:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[CONV2_I]])
// AArch32-NEXT: [[CONV4_I:%.*]] = zext i32 [[RBIT3_I]] to i64
// AArch32-NEXT: [[OR_I:%.*]] = or i64 [[SHL_I]], [[CONV4_I]]
// AArch32-NEXT: ret i64 [[OR_I]]
//
// AArch64-LABEL: @test_rbitll(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[RBIT_I:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[T:%.*]])
// AArch64-NEXT: ret i64 [[RBIT_I]]
//
uint64_t test_rbitll(uint64_t t) {
@@ -722,7 +633,7 @@ uint32_t test_usat(int32_t t) {
#ifdef __ARM_FEATURE_DSP
// AArch32-LABEL: @test_qadd(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_qadd(int32_t a, int32_t b) {
@@ -731,7 +642,7 @@ int32_t test_qadd(int32_t a, int32_t b) {
// AArch32-LABEL: @test_qsub(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_qsub(int32_t a, int32_t b) {
@@ -741,8 +652,8 @@ int32_t test_qsub(int32_t a, int32_t b) {
extern int32_t f();
// AArch32-LABEL: @test_qdbl(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() [[ATTR7:#.*]]
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]]) [[ATTR1]]
+// AArch32-NEXT: [[CALL:%.*]] = call i32 bitcast (i32 (...)* @f to i32 ()*)() #[[ATTR7:[0-9]+]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd(i32 [[CALL]], i32 [[CALL]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_qdbl() {
@@ -756,7 +667,7 @@ int32_t test_qdbl() {
#if __ARM_FEATURE_DSP
// AArch32-LABEL: @test_smulbb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbb(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smulbb(int32_t a, int32_t b) {
@@ -765,7 +676,7 @@ int32_t test_smulbb(int32_t a, int32_t b) {
// AArch32-LABEL: @test_smulbt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulbt(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smulbt(int32_t a, int32_t b) {
@@ -774,7 +685,7 @@ int32_t test_smulbt(int32_t a, int32_t b) {
// AArch32-LABEL: @test_smultb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultb(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smultb(int32_t a, int32_t b) {
@@ -783,7 +694,7 @@ int32_t test_smultb(int32_t a, int32_t b) {
// AArch32-LABEL: @test_smultt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smultt(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smultt(int32_t a, int32_t b) {
@@ -792,7 +703,7 @@ int32_t test_smultt(int32_t a, int32_t b) {
// AArch32-LABEL: @test_smulwb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwb(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smulwb(int32_t a, int32_t b) {
@@ -801,7 +712,7 @@ int32_t test_smulwb(int32_t a, int32_t b) {
// AArch32-LABEL: @test_smulwt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smulwt(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smulwt(int32_t a, int32_t b) {
@@ -813,7 +724,7 @@ int32_t test_smulwt(int32_t a, int32_t b) {
#if __ARM_FEATURE_DSP
// AArch32-LABEL: @test_smlabb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
@@ -822,7 +733,7 @@ int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
// AArch32-LABEL: @test_smlabt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlabt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
@@ -831,7 +742,7 @@ int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
// AArch32-LABEL: @test_smlatb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
@@ -840,7 +751,7 @@ int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
// AArch32-LABEL: @test_smlatt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlatt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
@@ -849,7 +760,7 @@ int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
// AArch32-LABEL: @test_smlawb(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawb(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
@@ -858,7 +769,7 @@ int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
// AArch32-LABEL: @test_smlawt(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlawt(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlawt(int32_t a, int32_t b, int32_t c) {
@@ -891,7 +802,7 @@ uint16x2_t test_usat16(int16x2_t a) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_sxtab16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtab16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
@@ -900,7 +811,7 @@ int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
// AArch32-LABEL: @test_sxtb16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sxtb16(i32 [[A:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_sxtb16(int8x4_t a) {
@@ -909,7 +820,7 @@ int16x2_t test_sxtb16(int8x4_t a) {
// AArch32-LABEL: @test_uxtab16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtab16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
@@ -918,7 +829,7 @@ int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
// AArch32-LABEL: @test_uxtb16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uxtb16(i32 [[A:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_uxtb16(int8x4_t a) {
@@ -930,7 +841,7 @@ int16x2_t test_uxtb16(int8x4_t a) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_sel(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sel(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
@@ -942,7 +853,7 @@ uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_qadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_qadd8(int8x4_t a, int8x4_t b) {
@@ -951,7 +862,7 @@ int16x2_t test_qadd8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_qsub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
@@ -960,7 +871,7 @@ int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_sadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
@@ -969,7 +880,7 @@ int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_shadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
@@ -978,7 +889,7 @@ int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_shsub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
@@ -987,7 +898,7 @@ int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_ssub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
@@ -996,7 +907,7 @@ int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
// AArch32-LABEL: @test_uadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
@@ -1005,7 +916,7 @@ uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
// AArch32-LABEL: @test_uhadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
@@ -1014,7 +925,7 @@ uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
// AArch32-LABEL: @test_uhsub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
@@ -1023,7 +934,7 @@ uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
// AArch32-LABEL: @test_uqadd8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
@@ -1032,7 +943,7 @@ uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
// AArch32-LABEL: @test_uqsub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
@@ -1041,7 +952,7 @@ uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
// AArch32-LABEL: @test_usub8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
@@ -1053,7 +964,7 @@ uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_usad8(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usad8(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
@@ -1065,7 +976,7 @@ uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
// AArch32-NEXT: [[CONV:%.*]] = zext i8 [[A:%.*]] to i32
// AArch32-NEXT: [[CONV1:%.*]] = zext i8 [[B:%.*]] to i32
// AArch32-NEXT: [[CONV2:%.*]] = zext i8 [[C:%.*]] to i32
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usada8(i32 [[CONV]], i32 [[CONV1]], i32 [[CONV2]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint32_t test_usada8(uint8_t a, uint8_t b, uint8_t c) {
@@ -1077,7 +988,7 @@ uint32_t test_usada8(uint8_t a, uint8_t b, uint8_t c) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_qadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
@@ -1086,7 +997,7 @@ int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_qasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
@@ -1095,7 +1006,7 @@ int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_qsax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
@@ -1104,7 +1015,7 @@ int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_qsub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.qsub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
@@ -1113,7 +1024,7 @@ int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_sadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
@@ -1122,7 +1033,7 @@ int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_sasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.sasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
@@ -1131,7 +1042,7 @@ int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_shadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
@@ -1140,7 +1051,7 @@ int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_shasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
@@ -1149,7 +1060,7 @@ int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_shsax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
@@ -1158,7 +1069,7 @@ int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_shsub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.shsub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
@@ -1167,7 +1078,7 @@ int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_ssax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
@@ -1176,7 +1087,7 @@ int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_ssub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.ssub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
@@ -1185,7 +1096,7 @@ int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_uadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
@@ -1194,7 +1105,7 @@ uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
@@ -1203,7 +1114,7 @@ uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uhadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
@@ -1212,7 +1123,7 @@ uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uhasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
@@ -1221,7 +1132,7 @@ uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uhsax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
@@ -1230,7 +1141,7 @@ uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uhsub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uhsub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
@@ -1239,7 +1150,7 @@ uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uqadd16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqadd16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
@@ -1248,7 +1159,7 @@ uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uqasx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqasx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
@@ -1257,7 +1168,7 @@ uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uqsax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
@@ -1266,7 +1177,7 @@ uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_uqsub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.uqsub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
@@ -1275,7 +1186,7 @@ uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_usax(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usax(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
@@ -1284,7 +1195,7 @@ uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
// AArch32-LABEL: @test_usub16(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.usub16(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
@@ -1296,7 +1207,7 @@ uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
#if __ARM_FEATURE_SIMD32
// AArch32-LABEL: @test_smlad(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlad(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1305,7 +1216,7 @@ int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
// AArch32-LABEL: @test_smladx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smladx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1314,7 +1225,7 @@ int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
// AArch32-LABEL: @test_smlald(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlald(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]])
// AArch32-NEXT: ret i64 [[TMP0]]
//
int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1323,7 +1234,7 @@ int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
// AArch32-LABEL: @test_smlaldx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlaldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]])
// AArch32-NEXT: ret i64 [[TMP0]]
//
int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1332,7 +1243,7 @@ int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
// AArch32-LABEL: @test_smlsd(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsd(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1341,7 +1252,7 @@ int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
// AArch32-LABEL: @test_smlsdx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smlsdx(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
@@ -1350,7 +1261,7 @@ int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
// AArch32-LABEL: @test_smlsld(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsld(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]])
// AArch32-NEXT: ret i64 [[TMP0]]
//
int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1359,7 +1270,7 @@ int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
// AArch32-LABEL: @test_smlsldx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.smlsldx(i32 [[A:%.*]], i32 [[B:%.*]], i64 [[C:%.*]])
// AArch32-NEXT: ret i64 [[TMP0]]
//
int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
@@ -1368,7 +1279,7 @@ int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
// AArch32-LABEL: @test_smuad(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuad(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smuad(int16x2_t a, int16x2_t b) {
@@ -1377,7 +1288,7 @@ int32_t test_smuad(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_smuadx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smuadx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smuadx(int16x2_t a, int16x2_t b) {
@@ -1386,7 +1297,7 @@ int32_t test_smuadx(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_smusd(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusd(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smusd(int16x2_t a, int16x2_t b) {
@@ -1395,7 +1306,7 @@ int32_t test_smusd(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_smusdx(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.smusdx(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
int32_t test_smusdx(int16x2_t a, int16x2_t b) {
@@ -1407,13 +1318,13 @@ int32_t test_smusdx(int16x2_t a, int16x2_t b) {
// AArch32-LABEL: @test_crc32b(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32b(i32 [[A:%.*]], i32 [[TMP0]])
// AArch32-NEXT: ret i32 [[TMP1]]
//
// AArch64-LABEL: @test_crc32b(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32b(i32 [[A:%.*]], i32 [[TMP0]])
// AArch64-NEXT: ret i32 [[TMP1]]
//
uint32_t test_crc32b(uint32_t a, uint8_t b) {
@@ -1423,13 +1334,13 @@ uint32_t test_crc32b(uint32_t a, uint8_t b) {
// AArch32-LABEL: @test_crc32h(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32h(i32 [[A:%.*]], i32 [[TMP0]])
// AArch32-NEXT: ret i32 [[TMP1]]
//
// AArch64-LABEL: @test_crc32h(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32h(i32 [[A:%.*]], i32 [[TMP0]])
// AArch64-NEXT: ret i32 [[TMP1]]
//
uint32_t test_crc32h(uint32_t a, uint16_t b) {
@@ -1438,12 +1349,12 @@ uint32_t test_crc32h(uint32_t a, uint16_t b) {
// AArch32-LABEL: @test_crc32w(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
// AArch64-LABEL: @test_crc32w(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32w(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch64-NEXT: ret i32 [[TMP0]]
//
uint32_t test_crc32w(uint32_t a, uint32_t b) {
@@ -1455,13 +1366,13 @@ uint32_t test_crc32w(uint32_t a, uint32_t b) {
// AArch32-NEXT: [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32
// AArch32-NEXT: [[TMP1:%.*]] = lshr i64 [[B]], 32
// AArch32-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32w(i32 [[A:%.*]], i32 [[TMP0]])
+// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32w(i32 [[TMP3]], i32 [[TMP2]])
// AArch32-NEXT: ret i32 [[TMP4]]
//
// AArch64-LABEL: @test_crc32d(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32x(i32 [[A:%.*]], i64 [[B:%.*]])
// AArch64-NEXT: ret i32 [[TMP0]]
//
uint32_t test_crc32d(uint32_t a, uint64_t b) {
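A side note on the AArch32 hunk above: AArch32 has no CRC instruction taking a 64-bit input, so __crc32d is lowered as two chained 32-bit steps over the halves of the input, low half first. A scalar C sketch of that lowering, using the ACLE __crc32w intrinsic from <arm_acle.h> (assumes a target with the CRC extension enabled):

#include <arm_acle.h>
#include <stdint.h>

/* Same shape as the IR above: trunc, lshr+trunc, then two crc32w steps. */
uint32_t crc32d_by_halves(uint32_t a, uint64_t b) {
  uint32_t lo = (uint32_t)b;         /* trunc i64 -> i32 */
  uint32_t hi = (uint32_t)(b >> 32); /* lshr 32, trunc   */
  return __crc32w(__crc32w(a, lo), hi);
}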
@@ -1471,13 +1382,13 @@ uint32_t test_crc32d(uint32_t a, uint64_t b) {
// AArch32-LABEL: @test_crc32cb(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32cb(i32 [[A:%.*]], i32 [[TMP0]])
// AArch32-NEXT: ret i32 [[TMP1]]
//
// AArch64-LABEL: @test_crc32cb(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = zext i8 [[B:%.*]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32cb(i32 [[A:%.*]], i32 [[TMP0]])
// AArch64-NEXT: ret i32 [[TMP1]]
//
uint32_t test_crc32cb(uint32_t a, uint8_t b) {
@@ -1487,13 +1398,13 @@ uint32_t test_crc32cb(uint32_t a, uint8_t b) {
// AArch32-LABEL: @test_crc32ch(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.crc32ch(i32 [[A:%.*]], i32 [[TMP0]])
// AArch32-NEXT: ret i32 [[TMP1]]
//
// AArch64-LABEL: @test_crc32ch(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = zext i16 [[B:%.*]] to i32
-// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP1:%.*]] = call i32 @llvm.aarch64.crc32ch(i32 [[A:%.*]], i32 [[TMP0]])
// AArch64-NEXT: ret i32 [[TMP1]]
//
uint32_t test_crc32ch(uint32_t a, uint16_t b) {
@@ -1502,12 +1413,12 @@ uint32_t test_crc32ch(uint32_t a, uint16_t b) {
// AArch32-LABEL: @test_crc32cw(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
// AArch64-LABEL: @test_crc32cw(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cw(i32 [[A:%.*]], i32 [[B:%.*]])
// AArch64-NEXT: ret i32 [[TMP0]]
//
uint32_t test_crc32cw(uint32_t a, uint32_t b) {
@@ -1519,13 +1430,13 @@ uint32_t test_crc32cw(uint32_t a, uint32_t b) {
// AArch32-NEXT: [[TMP0:%.*]] = trunc i64 [[B:%.*]] to i32
// AArch32-NEXT: [[TMP1:%.*]] = lshr i64 [[B]], 32
// AArch32-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
-// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]]) [[ATTR1]]
-// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]]) [[ATTR1]]
+// AArch32-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[A:%.*]], i32 [[TMP0]])
+// AArch32-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.crc32cw(i32 [[TMP3]], i32 [[TMP2]])
// AArch32-NEXT: ret i32 [[TMP4]]
//
// AArch64-LABEL: @test_crc32cd(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3]]
+// AArch64-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.crc32cx(i32 [[A:%.*]], i64 [[B:%.*]])
// AArch64-NEXT: ret i32 [[TMP0]]
//
uint32_t test_crc32cd(uint32_t a, uint64_t b) {
@@ -1535,12 +1446,12 @@ uint32_t test_crc32cd(uint32_t a, uint64_t b) {
/* 10.1 Special register intrinsics */
// AArch32-LABEL: @test_rsr(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32:!.*]])
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9:![0-9]+]])
// AArch32-NEXT: ret i32 [[TMP0]]
//
// AArch64-LABEL: @test_rsr(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR:!.*]])
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8:![0-9]+]])
// AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
// AArch64-NEXT: ret i32 [[TMP1]]
//
@@ -1554,12 +1465,12 @@ uint32_t test_rsr() {
// AArch32-LABEL: @test_rsr64(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64:!.*]])
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10:![0-9]+]])
// AArch32-NEXT: ret i64 [[TMP0]]
//
// AArch64-LABEL: @test_rsr64(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
// AArch64-NEXT: ret i64 [[TMP0]]
//
uint64_t test_rsr64() {
@@ -1572,13 +1483,13 @@ uint64_t test_rsr64() {
// AArch32-LABEL: @test_rsrp(
// AArch32-NEXT: entry:
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32SYSREG:!.*]])
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META11:![0-9]+]])
// AArch32-NEXT: [[TMP1:%.*]] = inttoptr i32 [[TMP0]] to i8*
// AArch32-NEXT: ret i8* [[TMP1]]
//
// AArch64-LABEL: @test_rsrp(
// AArch64-NEXT: entry:
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64SYSREG:!.*]])
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META9:![0-9]+]])
// AArch64-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to i8*
// AArch64-NEXT: ret i8* [[TMP1]]
//
@@ -1588,13 +1499,13 @@ void *test_rsrp() {
// AArch32-LABEL: @test_wsr(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[V:%.*]])
+// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META9]], i32 [[V:%.*]])
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wsr(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = zext i32 [[V:%.*]] to i64
-// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP0]])
+// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP0]])
// AArch64-NEXT: ret void
//
void test_wsr(uint32_t v) {
@@ -1607,12 +1518,12 @@ void test_wsr(uint32_t v) {
// AArch32-LABEL: @test_wsr64(
// AArch32-NEXT: entry:
-// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[V:%.*]])
+// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[META10]], i64 [[V:%.*]])
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wsr64(
// AArch64-NEXT: entry:
-// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[V:%.*]])
+// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[V:%.*]])
// AArch64-NEXT: ret void
//
void test_wsr64(uint64_t v) {
@@ -1626,13 +1537,13 @@ void test_wsr64(uint64_t v) {
// AArch32-LABEL: @test_wsrp(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i32
-// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32SYSREG]], i32 [[TMP0]])
+// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META11]], i32 [[TMP0]])
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wsrp(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[TMP0:%.*]] = ptrtoint i8* [[V:%.*]] to i64
-// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64SYSREG]], i64 [[TMP0]])
+// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META9]], i64 [[TMP0]])
// AArch64-NEXT: ret void
//
void test_wsrp(void *v) {
@@ -1642,7 +1553,7 @@ void test_wsrp(void *v) {
// AArch32-LABEL: @test_rsrf(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[REF_TMP:%.*]] = alloca i32, align 4
-// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[A32RSR32]])
+// AArch32-NEXT: [[TMP0:%.*]] = call i32 @llvm.read_volatile_register.i32(metadata [[META9]])
// AArch32-NEXT: store i32 [[TMP0]], i32* [[REF_TMP]], align 4
// AArch32-NEXT: [[TMP1:%.*]] = bitcast i32* [[REF_TMP]] to float*
// AArch32-NEXT: [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
@@ -1651,7 +1562,7 @@ void test_wsrp(void *v) {
// AArch64-LABEL: @test_rsrf(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[REF_TMP:%.*]] = alloca i32, align 4
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
// AArch64-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
// AArch64-NEXT: store i32 [[TMP1]], i32* [[REF_TMP]], align 4
// AArch64-NEXT: [[TMP2:%.*]] = bitcast i32* [[REF_TMP]] to float*
@@ -1669,7 +1580,7 @@ float test_rsrf() {
// AArch32-LABEL: @test_rsrf64(
// AArch32-NEXT: entry:
// AArch32-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8
-// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A32RSR64]])
+// AArch32-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META10]])
// AArch32-NEXT: store i64 [[TMP0]], i64* [[REF_TMP]], align 8
// AArch32-NEXT: [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double*
// AArch32-NEXT: [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
@@ -1678,7 +1589,7 @@ float test_rsrf() {
// AArch64-LABEL: @test_rsrf64(
// AArch64-NEXT: entry:
// AArch64-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8
-// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[A64RSR]])
+// AArch64-NEXT: [[TMP0:%.*]] = call i64 @llvm.read_volatile_register.i64(metadata [[META8]])
// AArch64-NEXT: store i64 [[TMP0]], i64* [[REF_TMP]], align 8
// AArch64-NEXT: [[TMP1:%.*]] = bitcast i64* [[REF_TMP]] to double*
// AArch64-NEXT: [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
@@ -1698,7 +1609,7 @@ double test_rsrf64() {
// AArch32-NEXT: store float [[V:%.*]], float* [[V_ADDR]], align 4
// AArch32-NEXT: [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32*
// AArch32-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
-// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[A32RSR32]], i32 [[TMP1]])
+// AArch32-NEXT: call void @llvm.write_register.i32(metadata [[META9]], i32 [[TMP1]])
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wsrf(
@@ -1708,7 +1619,7 @@ double test_rsrf64() {
// AArch64-NEXT: [[TMP0:%.*]] = bitcast float* [[V_ADDR]] to i32*
// AArch64-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
// AArch64-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
-// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP2]])
+// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP2]])
// AArch64-NEXT: ret void
//
void test_wsrf(float v) {
@@ -1725,7 +1636,7 @@ void test_wsrf(float v) {
// AArch32-NEXT: store double [[V:%.*]], double* [[V_ADDR]], align 8
// AArch32-NEXT: [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64*
// AArch32-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
-// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[A32RSR64]], i64 [[TMP1]])
+// AArch32-NEXT: call void @llvm.write_register.i64(metadata [[META10]], i64 [[TMP1]])
// AArch32-NEXT: ret void
//
// AArch64-LABEL: @test_wsrf64(
@@ -1734,7 +1645,7 @@ void test_wsrf(float v) {
// AArch64-NEXT: store double [[V:%.*]], double* [[V_ADDR]], align 8
// AArch64-NEXT: [[TMP0:%.*]] = bitcast double* [[V_ADDR]] to i64*
// AArch64-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8
-// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[A64RSR]], i64 [[TMP1]])
+// AArch64-NEXT: call void @llvm.write_register.i64(metadata [[META8]], i64 [[TMP1]])
// AArch64-NEXT: ret void
//
void test_wsrf64(double v) {
@@ -1748,7 +1659,7 @@ void test_wsrf64(double v) {
#ifdef __ARM_64BIT_STATE
// AArch6483-LABEL: @test_jcvt(
// AArch6483-NEXT: entry:
-// AArch6483-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]]) [[ATTR3:#.*]]
+// AArch6483-NEXT: [[TMP0:%.*]] = call i32 @llvm.aarch64.fjcvtzs(double [[V:%.*]])
// AArch6483-NEXT: ret i32 [[TMP0]]
//
int32_t test_jcvt(double v) {
@@ -1761,7 +1672,7 @@ int32_t test_jcvt(double v) {
// AArch6485-LABEL: @test_rndr(
// AArch6485-NEXT: entry:
-// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr() [[ATTR3:#.*]]
+// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndr()
// AArch6485-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
// AArch6485-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
// AArch6485-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
@@ -1774,7 +1685,7 @@ int test_rndr(uint64_t *__addr) {
// AArch6485-LABEL: @test_rndrrs(
// AArch6485-NEXT: entry:
-// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs() [[ATTR3:#.*]]
+// AArch6485-NEXT: [[TMP0:%.*]] = call { i64, i1 } @llvm.aarch64.rndrrs()
// AArch6485-NEXT: [[TMP1:%.*]] = extractvalue { i64, i1 } [[TMP0]], 0
// AArch6485-NEXT: [[TMP2:%.*]] = extractvalue { i64, i1 } [[TMP0]], 1
// AArch6485-NEXT: store i64 [[TMP1]], i64* [[__ADDR:%.*]], align 8
@@ -1786,9 +1697,4 @@ int test_rndrrs(uint64_t *__addr) {
}
#endif
-// AArch32: [[A32RSR32]] = !{!"cp1:2:c3:c4:5"}
-// AArch32: [[A32RSR64]] = !{!"cp1:2:c3"}
-// AArch32: [[A32SYSREG]] = !{!"sysreg"}
-// AArch64: [[A64RSR]] = !{!"1:2:3:4:5"}
-// AArch64: [[A64SYSREG]] = !{!"sysreg"}
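Beyond dropping the call-site attribute captures, the regenerated checks in this file also switch the special-register metadata from hand-named FileCheck variables ([[A32RSR32]], [[A64RSR]], ...) to the autogenerated [[METAn:![0-9]+]] form. The regex binds the variable to whatever metadata number appears at its first use, and later lines reuse that binding, which is why the explicit "[[A32RSR32]] = !{...}" definition checks at the end of the file are no longer needed. A minimal sketch of the mechanics (hypothetical function name, same register string as the test):

// CHECK-LABEL: @read_twice(
// CHECK: call i32 @llvm.read_volatile_register.i32(metadata [[META0:![0-9]+]])
// CHECK: call i32 @llvm.read_volatile_register.i32(metadata [[META0]])
unsigned read_twice(void) {
  unsigned a = __builtin_arm_rsr("cp1:2:c3:c4:5");
  unsigned b = __builtin_arm_rsr("cp1:2:c3:c4:5");
  return a + b;
}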
diff --git a/clang/test/CodeGen/memcpy-inline-builtin.c b/clang/test/CodeGen/memcpy-inline-builtin.c
index 050f9a8ba871f..2d0c5792cec36 100644
--- a/clang/test/CodeGen/memcpy-inline-builtin.c
+++ b/clang/test/CodeGen/memcpy-inline-builtin.c
@@ -36,7 +36,7 @@ AVAILABLE_EXTERNALLY void *memcpy(void *a, const void *b, size_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[A_ADDR_I]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i8*, i8** [[B_ADDR_I]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[C_ADDR_I]], align 8
-// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR3]]
+// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false)
// CHECK-NEXT: ret i8* [[TMP3]]
//
void *foo(void *a, const void *b, size_t c) {
diff --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c
index d20211eefec1f..53acbf4de4c96 100644
--- a/clang/test/Headers/wasm.c
+++ b/clang/test/Headers/wasm.c
@@ -1522,7 +1522,7 @@ v128_t test_v128_andnot(v128_t a, v128_t b) {
// CHECK-LABEL: @test_v128_any_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@@ -1532,7 +1532,7 @@ bool test_v128_any_true(v128_t a) {
// CHECK-LABEL: @test_v128_bitselect(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) {
@@ -1542,7 +1542,7 @@ v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) {
// CHECK-LABEL: @test_i8x16_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -1564,7 +1564,7 @@ v128_t test_i8x16_neg(v128_t a) {
// CHECK-LABEL: @test_i8x16_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@@ -1575,7 +1575,7 @@ bool test_i8x16_all_true(v128_t a) {
// CHECK-LABEL: @test_i8x16_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i8x16_bitmask(v128_t a) {
@@ -1585,7 +1585,7 @@ uint32_t test_i8x16_bitmask(v128_t a) {
// CHECK-LABEL: @test_i8x16_popcnt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -1651,7 +1651,7 @@ v128_t test_i8x16_add(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1663,7 +1663,7 @@ v128_t test_i8x16_add_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1687,7 +1687,7 @@ v128_t test_i8x16_sub(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1699,7 +1699,7 @@ v128_t test_i8x16_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1711,7 +1711,7 @@ v128_t test_u8x16_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1723,7 +1723,7 @@ v128_t test_i8x16_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1735,7 +1735,7 @@ v128_t test_u8x16_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1747,7 +1747,7 @@ v128_t test_i8x16_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1759,7 +1759,7 @@ v128_t test_u8x16_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
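A quick semantic note on avgr: the Wasm avgr_u operation is a rounding average, (a + b + 1) >> 1 per lane, computed without intermediate overflow. One lane written out in C (illustrative helper name):

#include <stdint.h>

/* Rounding unsigned average, widened so a + b + 1 cannot overflow. */
static uint8_t avgr_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}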
@@ -1770,7 +1770,7 @@ v128_t test_u8x16_avgr(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i16x8_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -1792,7 +1792,7 @@ v128_t test_i16x8_neg(v128_t a) {
// CHECK-LABEL: @test_i16x8_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@@ -1803,7 +1803,7 @@ bool test_i16x8_all_true(v128_t a) {
// CHECK-LABEL: @test_i16x8_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i16x8_bitmask(v128_t a) {
@@ -1868,7 +1868,7 @@ v128_t test_i16x8_add(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1880,7 +1880,7 @@ v128_t test_i16x8_add_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1904,7 +1904,7 @@ v128_t test_i16x8_sub(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1916,7 +1916,7 @@ v128_t test_i16x8_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1940,7 +1940,7 @@ v128_t test_i16x8_mul(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1952,7 +1952,7 @@ v128_t test_i16x8_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1964,7 +1964,7 @@ v128_t test_u16x8_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1976,7 +1976,7 @@ v128_t test_i16x8_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1988,7 +1988,7 @@ v128_t test_u16x8_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -1998,7 +1998,7 @@ v128_t test_u16x8_avgr(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_abs(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_abs(v128_t a) {
@@ -2016,7 +2016,7 @@ v128_t test_i32x4_neg(v128_t a) {
// CHECK-LABEL: @test_i32x4_all_true(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP0]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@@ -2026,7 +2026,7 @@ bool test_i32x4_all_true(v128_t a) {
// CHECK-LABEL: @test_i32x4_bitmask(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]])
// CHECK-NEXT: ret i32 [[TMP0]]
//
uint32_t test_i32x4_bitmask(v128_t a) {
@@ -2095,7 +2095,7 @@ v128_t test_i32x4_mul(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_min(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_min(v128_t a, v128_t b) {
@@ -2104,7 +2104,7 @@ v128_t test_i32x4_min(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u32x4_min(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_u32x4_min(v128_t a, v128_t b) {
@@ -2113,7 +2113,7 @@ v128_t test_u32x4_min(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_max(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_max(v128_t a, v128_t b) {
@@ -2122,7 +2122,7 @@ v128_t test_i32x4_max(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u32x4_max(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_u32x4_max(v128_t a, v128_t b) {
@@ -2133,7 +2133,7 @@ v128_t test_u32x4_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) {
@@ -2143,7 +2143,7 @@ v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i64x2_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2165,7 +2165,7 @@ v128_t test_i64x2_neg(v128_t a) {
// CHECK-LABEL: @test_i64x2_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@@ -2176,7 +2176,7 @@ bool test_i64x2_all_true(v128_t a) {
// CHECK-LABEL: @test_i64x2_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i64x2_bitmask(v128_t a) {
@@ -2264,7 +2264,7 @@ v128_t test_i64x2_mul(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f32x4_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2286,7 +2286,7 @@ v128_t test_f32x4_neg(v128_t a) {
// CHECK-LABEL: @test_f32x4_sqrt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2297,7 +2297,7 @@ v128_t test_f32x4_sqrt(v128_t a) {
// CHECK-LABEL: @test_f32x4_ceil(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2308,7 +2308,7 @@ v128_t test_f32x4_ceil(v128_t a) {
// CHECK-LABEL: @test_f32x4_floor(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2319,7 +2319,7 @@ v128_t test_f32x4_floor(v128_t a) {
// CHECK-LABEL: @test_f32x4_trunc(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2330,7 +2330,7 @@ v128_t test_f32x4_trunc(v128_t a) {
// CHECK-LABEL: @test_f32x4_nearest(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2390,7 +2390,7 @@ v128_t test_f32x4_div(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2402,7 +2402,7 @@ v128_t test_f32x4_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2414,7 +2414,7 @@ v128_t test_f32x4_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2426,7 +2426,7 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2437,7 +2437,7 @@ v128_t test_f32x4_pmax(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f64x2_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2459,7 +2459,7 @@ v128_t test_f64x2_neg(v128_t a) {
// CHECK-LABEL: @test_f64x2_sqrt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2470,7 +2470,7 @@ v128_t test_f64x2_sqrt(v128_t a) {
// CHECK-LABEL: @test_f64x2_ceil(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2481,7 +2481,7 @@ v128_t test_f64x2_ceil(v128_t a) {
// CHECK-LABEL: @test_f64x2_floor(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2492,7 +2492,7 @@ v128_t test_f64x2_floor(v128_t a) {
// CHECK-LABEL: @test_f64x2_trunc(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2503,7 +2503,7 @@ v128_t test_f64x2_trunc(v128_t a) {
// CHECK-LABEL: @test_f64x2_nearest(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2563,7 +2563,7 @@ v128_t test_f64x2_div(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2575,7 +2575,7 @@ v128_t test_f64x2_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2587,7 +2587,7 @@ v128_t test_f64x2_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2599,7 +2599,7 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2610,7 +2610,7 @@ v128_t test_f64x2_pmax(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_trunc_sat_f32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_i32x4_trunc_sat_f32x4(v128_t a) {
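For reference, the intrinsic matched by these trunc_sat checks, llvm.fptosi.sat, is LLVM's target-independent saturating conversion: out-of-range inputs clamp to the integer limits and NaN produces 0, with no undefined behavior. A scalar C model of one lane (illustrative helper name):

#include <stdint.h>

/* Scalar model of llvm.fptosi.sat.i32.f32. */
static int32_t fptosi_sat_i32(float x) {
  if (x != x) return 0;                        /* NaN -> 0      */
  if (x <= (float)INT32_MIN) return INT32_MIN; /* clamp low     */
  if (x >= (float)INT32_MAX) return INT32_MAX; /* clamp high    */
  return (int32_t)x;                           /* else truncate */
}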
@@ -2620,7 +2620,7 @@ v128_t test_i32x4_trunc_sat_f32x4(v128_t a) {
// CHECK-LABEL: @test_u32x4_trunc_sat_f32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_u32x4_trunc_sat_f32x4(v128_t a) {
@@ -2672,7 +2672,7 @@ v128_t test_f64x2_convert_low_u32x4(v128_t a) {
// CHECK-LABEL: @test_i32x4_trunc_sat_f64x2_zero(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2683,7 +2683,7 @@ v128_t test_i32x4_trunc_sat_f64x2_zero(v128_t a) {
// CHECK-LABEL: @test_u32x4_trunc_sat_f64x2_zero(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2767,7 +2767,7 @@ v128_t test_i64x2_shuffle(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2779,7 +2779,7 @@ v128_t test_i8x16_swizzle(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2791,7 +2791,7 @@ v128_t test_i8x16_narrow_i16x8(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@@ -2801,7 +2801,7 @@ v128_t test_u8x16_narrow_i16x8(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i16x8_narrow_i32x4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_i16x8_narrow_i32x4(v128_t a, v128_t b) {
@@ -2811,7 +2811,7 @@ v128_t test_i16x8_narrow_i32x4(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u16x8_narrow_i32x4(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_u16x8_narrow_i32x4(v128_t a, v128_t b) {
@@ -2958,7 +2958,7 @@ v128_t test_u64x2_extend_high_u32x4(v128_t a) {
// CHECK-LABEL: @test_i16x8_extadd_pairwise_i8x16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2969,7 +2969,7 @@ v128_t test_i16x8_extadd_pairwise_i8x16(v128_t a) {
// CHECK-LABEL: @test_u16x8_extadd_pairwise_u8x16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@@ -2980,7 +2980,7 @@ v128_t test_u16x8_extadd_pairwise_u8x16(v128_t a) {
// CHECK-LABEL: @test_i32x4_extadd_pairwise_i16x8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) {
@@ -2990,7 +2990,7 @@ v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) {
// CHECK-LABEL: @test_u32x4_extadd_pairwise_u16x8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) {
@@ -3181,7 +3181,7 @@ v128_t test_u64x2_extmul_high_u32x4(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
-// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
diff --git a/clang/test/OpenMP/cancel_codegen.cpp b/clang/test/OpenMP/cancel_codegen.cpp
index 9f50145e923a1..467bd2f639a59 100644
--- a/clang/test/OpenMP/cancel_codegen.cpp
+++ b/clang/test/OpenMP/cancel_codegen.cpp
@@ -396,7 +396,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
-// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
@@ -1045,8 +1045,8 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
// CHECK3-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK3-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]]) #[[ATTR2:[0-9]+]]
-// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4) #[[ATTR2]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
+// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4)
// CHECK3-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK3: .cancel.exit.i:
diff --git a/clang/test/OpenMP/cancellation_point_codegen.cpp b/clang/test/OpenMP/cancellation_point_codegen.cpp
index 3f6ea6e6eabb8..3fd8d3ee47681 100644
--- a/clang/test/OpenMP/cancellation_point_codegen.cpp
+++ b/clang/test/OpenMP/cancellation_point_codegen.cpp
@@ -390,14 +390,14 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
-// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2:[0-9]+]]
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
// CHECK1-NEXT: store i32 1, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: .cancel.continue.i:
-// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
// CHECK1-NEXT: br i1 [[TMP15]], label [[DOTCANCEL_EXIT1_I:%.*]], label [[DOTCANCEL_CONTINUE2_I:%.*]]
// CHECK1: .cancel.exit1.i:
@@ -445,7 +445,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24
-// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
diff --git a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
index 02ab6da2e4cdb..adefb908f249d 100644
--- a/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/distribute_parallel_for_reduction_task_codegen.cpp
@@ -569,7 +569,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -592,7 +592,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/for_reduction_task_codegen.cpp b/clang/test/OpenMP/for_reduction_task_codegen.cpp
index 876b840ec5ebc..cf1c1bcf8a529 100644
--- a/clang/test/OpenMP/for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/for_reduction_task_codegen.cpp
@@ -478,7 +478,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -501,7 +501,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
index a1a580852fe62..143dee5cdac06 100644
--- a/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_in_reduction_codegen.cpp
@@ -631,18 +631,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
-// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
-// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32
diff --git a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
index b73bdf48b4ee3..93088bb5d0557 100644
--- a/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/master_taskloop_simd_in_reduction_codegen.cpp
@@ -631,18 +631,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
-// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
-// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32
diff --git a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
index 8e718bd436e8b..c616cfafb7c58 100644
--- a/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_reduction_task_codegen.cpp
@@ -473,7 +473,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -496,7 +496,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
index 4b0d8d756c93c..9e3baa763f790 100644
--- a/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_reduction_task_codegen.cpp
@@ -428,7 +428,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -451,7 +451,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp
index 4e9cc915fda05..24be347e462f2 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_codegen.cpp
@@ -721,7 +721,7 @@ struct S {
// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !52
// CHECK1-NEXT: store i32 [[TMP24]], i32* [[I_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
-// CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]], i32 4) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]], i32 4)
// CHECK1-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
// CHECK1-NEXT: br i1 [[TMP27]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
@@ -729,7 +729,7 @@ struct S {
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__9_EXIT:%.*]]
// CHECK1: .cancel.continue.i:
// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
-// CHECK1-NEXT: [[TMP29:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i32 4) #[[ATTR2]]
+// CHECK1-NEXT: [[TMP29:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i32 4)
// CHECK1-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
// CHECK1-NEXT: br i1 [[TMP30]], label [[DOTCANCEL_EXIT2_I:%.*]], label [[DOTCANCEL_CONTINUE3_I:%.*]]
// CHECK1: .cancel.exit2.i:
diff --git a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
index ab72f7727c7a1..821be80a760f9 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_lastprivate_codegen.cpp
@@ -479,7 +479,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP42]], i64 0, i64 0
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false)
// CHECK1-NEXT: store i32 33, i32* [[TMP44]], align 4
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14
// CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
@@ -492,7 +492,7 @@ void loop() {
// CHECK1: .omp.lastprivate.then.i:
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %struct.S* [[TMP27]] to i8*
// CHECK1-NEXT: [[TMP56:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP41]], align 4
// CHECK1-NEXT: store i32 [[TMP57]], i32* [[TMP29]], align 4
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP31]], i32 0, i32 0
@@ -504,7 +504,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP61:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP59]]
@@ -512,7 +512,7 @@ void loop() {
// CHECK1: omp.arraycpy.done8.i:
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [2 x i32]* [[TMP33]] to i8*
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [2 x i32]* [[TMP43]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP64:%.*]] = load i32, i32* [[TMP44]], align 4
// CHECK1-NEXT: store i32 [[TMP64]], i32* [[TMP39]], align 4
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT]]
@@ -891,7 +891,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP40]], i64 0, i64 0
// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8*
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false)
// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !28
// CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP49]], 1
// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !28
@@ -905,7 +905,7 @@ void loop() {
// CHECK1-NEXT: store i32 [[TMP52]], i32* [[TMP27]], align 128
// CHECK1-NEXT: [[TMP53:%.*]] = bitcast [2 x i32]* [[TMP29]] to i8*
// CHECK1-NEXT: [[TMP54:%.*]] = bitcast [2 x i32]* [[TMP39]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false)
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP31]], i32 0, i32 0
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast [2 x %struct.S.0]* [[TMP40]] to %struct.S.0*
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN_I]], i64 2
@@ -915,7 +915,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP57:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP56]]
@@ -923,7 +923,7 @@ void loop() {
// CHECK1: omp.arraycpy.done7.i:
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S.0* [[TMP35]] to i8*
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false)
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT]]
// CHECK1: .omp_outlined..3.exit:
// CHECK1-NEXT: ret i32 0
@@ -1195,7 +1195,7 @@ void loop() {
// CHECK3-NEXT: store double* [[TMP30]], double** [[TMP36]], align 8, !noalias !14
// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1
// CHECK3-NEXT: store i32* [[TMP31]], i32** [[TMP37]], align 8, !noalias !14
-// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]) #[[ATTR3]]
+// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]])
// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14
// CHECK3-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP38]], 1
// CHECK3-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14
diff --git a/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp b/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
index 7fa0924e3b681..ef015914d6710 100644
--- a/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
+++ b/clang/test/OpenMP/parallel_master_taskloop_simd_lastprivate_codegen.cpp
@@ -479,7 +479,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP42]], i64 0, i64 0
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false) #[[ATTR4]], !llvm.access.group [[ACC_GRP15]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false), !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: store i32 33, i32* [[TMP44]], align 4, !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
@@ -492,7 +492,7 @@ void loop() {
// CHECK1: .omp.lastprivate.then.i:
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %struct.S* [[TMP27]] to i8*
// CHECK1-NEXT: [[TMP56:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP41]], align 4
// CHECK1-NEXT: store i32 [[TMP57]], i32* [[TMP29]], align 4
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP31]], i32 0, i32 0
@@ -504,7 +504,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP61:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP59]]
@@ -512,7 +512,7 @@ void loop() {
// CHECK1: omp.arraycpy.done8.i:
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [2 x i32]* [[TMP33]] to i8*
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [2 x i32]* [[TMP43]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP64:%.*]] = load i32, i32* [[TMP44]], align 4
// CHECK1-NEXT: store i32 [[TMP64]], i32* [[TMP39]], align 4
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT]]
@@ -891,7 +891,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP40]], i64 0, i64 0
// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8*
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false) #[[ATTR4]], !llvm.access.group [[ACC_GRP33]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false), !llvm.access.group [[ACC_GRP33]]
// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]]
// CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP49]], 1
// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]]
@@ -905,7 +905,7 @@ void loop() {
// CHECK1-NEXT: store i32 [[TMP52]], i32* [[TMP27]], align 128
// CHECK1-NEXT: [[TMP53:%.*]] = bitcast [2 x i32]* [[TMP29]] to i8*
// CHECK1-NEXT: [[TMP54:%.*]] = bitcast [2 x i32]* [[TMP39]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false)
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP31]], i32 0, i32 0
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast [2 x %struct.S.0]* [[TMP40]] to %struct.S.0*
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN_I]], i64 2
@@ -915,7 +915,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP57:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP56]]
@@ -923,7 +923,7 @@ void loop() {
// CHECK1: omp.arraycpy.done7.i:
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S.0* [[TMP35]] to i8*
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false)
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT]]
// CHECK1: .omp_outlined..3.exit:
// CHECK1-NEXT: ret i32 0
@@ -1195,7 +1195,7 @@ void loop() {
// CHECK3-NEXT: store double* [[TMP30]], double** [[TMP36]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1
// CHECK3-NEXT: store i32* [[TMP31]], i32** [[TMP37]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
-// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP15]]
+// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]), !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP38]], 1
// CHECK3-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
diff --git a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
index 4fcc6940db8db..819425ec29cb5 100644
--- a/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_reduction_task_codegen.cpp
@@ -419,7 +419,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -442,7 +442,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
index 57e3e21b01f73..035b72adc3bc8 100644
--- a/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/parallel_sections_reduction_task_codegen.cpp
@@ -461,7 +461,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -484,7 +484,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/sections_reduction_task_codegen.cpp b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
index 658456951d220..9258ea653afca 100644
--- a/clang/test/OpenMP/sections_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/sections_reduction_task_codegen.cpp
@@ -466,7 +466,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -489,7 +489,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/target_in_reduction_codegen.cpp b/clang/test/OpenMP/target_in_reduction_codegen.cpp
index d16a756151ba9..6ffbc48dd0d9b 100644
--- a/clang/test/OpenMP/target_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/target_in_reduction_codegen.cpp
@@ -612,7 +612,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP20]] to i8*
-// CHECK1-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP22]], i8* [[TMP21]], i8* [[TMP23]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP22]], i8* [[TMP21]], i8* [[TMP23]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP24]] to i32*
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 0
// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[TMP25]], align 8
diff --git a/clang/test/OpenMP/target_parallel_codegen.cpp b/clang/test/OpenMP/target_parallel_codegen.cpp
index d3a68bdd97213..9129aa695c99f 100644
--- a/clang/test/OpenMP/target_parallel_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_codegen.cpp
@@ -641,7 +641,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !21
-// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -1783,7 +1783,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !22
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !22
-// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/target_parallel_for_codegen.cpp b/clang/test/OpenMP/target_parallel_for_codegen.cpp
index e0743981e5b78..3b7e1d038ef48 100644
--- a/clang/test/OpenMP/target_parallel_for_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_codegen.cpp
@@ -1117,7 +1117,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP32]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !24
-// CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK1-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2847,7 +2847,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP32]], align 4, !noalias !25
// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !25
-// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK3-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -6159,7 +6159,7 @@ int bar(int n){
// CHECK17-NEXT: store i8** null, i8*** [[TMP32]], align 8, !noalias !24
// CHECK17-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK17-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !24
-// CHECK17-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK17-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK17-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK17-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK17: omp_offload.failed.i:
@@ -7889,7 +7889,7 @@ int bar(int n){
// CHECK19-NEXT: store i8** null, i8*** [[TMP32]], align 4, !noalias !25
// CHECK19-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK19-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !25
-// CHECK19-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK19-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK19-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK19-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK19: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
index 61c384c9f5ea4..0176fa4266315 100644
--- a/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_reduction_task_codegen.cpp
@@ -486,7 +486,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -509,7 +509,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
index 484e3d28c1feb..2ef121e21153e 100644
--- a/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_for_simd_codegen.cpp
@@ -757,7 +757,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !25
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !25
-// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2438,7 +2438,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !26
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !26
-// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -4097,7 +4097,7 @@ int bar(int n){
// CHECK5-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !25
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK5-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !25
-// CHECK5-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK5-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK5-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK5-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK5: omp_offload.failed.i:
@@ -5890,7 +5890,7 @@ int bar(int n){
// CHECK7-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !26
// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK7-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !26
-// CHECK7-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK7-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK7-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK7-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK7: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
index b4773114c3bd7..95b591365bb24 100644
--- a/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_parallel_reduction_task_codegen.cpp
@@ -432,7 +432,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -455,7 +455,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/target_teams_codegen.cpp b/clang/test/OpenMP/target_teams_codegen.cpp
index e6cc6ce2298a4..e8e2be8558c74 100644
--- a/clang/test/OpenMP/target_teams_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_codegen.cpp
@@ -895,7 +895,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP34]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP35]], align 8, !noalias !24
-// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK1-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2441,7 +2441,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP34]], align 4, !noalias !25
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP35]], align 8, !noalias !25
-// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
+// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK3-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/target_teams_distribute_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
index de75e2118e6c8..7d26aebb0d563 100644
--- a/clang/test/OpenMP/target_teams_distribute_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_codegen.cpp
@@ -866,7 +866,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP34]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 10, i64* [[TMP35]], align 8, !noalias !21
-// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK1-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2684,7 +2684,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP34]], align 4, !noalias !22
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 10, i64* [[TMP35]], align 8, !noalias !22
-// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
+// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK3-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
index 2bcebc0f24e71..b48d34c301f50 100644
--- a/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_task_codegen.cpp
@@ -832,7 +832,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -855,7 +855,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
index 58a0757245d34..00106a686c7fa 100644
--- a/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp
@@ -855,7 +855,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP33]], align 8, !noalias !26
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !26
-// CHECK1-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK1-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2695,7 +2695,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP33]], align 4, !noalias !27
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !27
-// CHECK3-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK3-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -4515,7 +4515,7 @@ int bar(int n){
// CHECK5-NEXT: store i8** null, i8*** [[TMP33]], align 8, !noalias !26
// CHECK5-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK5-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !26
-// CHECK5-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK5-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK5-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK5-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK5: omp_offload.failed.i:
@@ -6430,7 +6430,7 @@ int bar(int n){
// CHECK7-NEXT: store i8** null, i8*** [[TMP33]], align 4, !noalias !27
// CHECK7-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK7-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !27
-// CHECK7-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
+// CHECK7-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK7-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK7-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK7: omp_offload.failed.i:
diff --git a/clang/test/OpenMP/task_codegen.cpp b/clang/test/OpenMP/task_codegen.cpp
index e4ef482e69c60..d9fb486f81e23 100644
--- a/clang/test/OpenMP/task_codegen.cpp
+++ b/clang/test/OpenMP/task_codegen.cpp
@@ -658,13 +658,13 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
-// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK1-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK1-NEXT: br label [[CLEANUP_I]]
// CHECK1: cleanup.i:
@@ -721,7 +721,7 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -781,7 +781,7 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -1099,26 +1099,26 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK1-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK1: .untied.jmp.2.i:
-// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
+// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK1-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK1-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK1-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK1-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.3.i:
-// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
+// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK1-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK1-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK1-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.18*
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 0
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 2
@@ -1126,43 +1126,43 @@ void test_omp_all_memory()
// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK1-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK1-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK1-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.5.i:
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK1-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.7.i:
-// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
+// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK1-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK1-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK1-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK1-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK1-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
+// CHECK1-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.10.i:
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
-// CHECK1-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK1-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK1-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !112
// CHECK1-NEXT: br label [[CLEANUP_I]]
@@ -1850,13 +1850,13 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
-// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK2-NEXT: br label [[CLEANUP_I]]
// CHECK2: cleanup.i:
@@ -1913,7 +1913,7 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -1973,7 +1973,7 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -2291,26 +2291,26 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK2-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK2: .untied.jmp.2.i:
-// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
+// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK2-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK2-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK2-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK2-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.3.i:
-// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
+// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK2-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK2-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.18*
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 0
// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 2
@@ -2318,43 +2318,43 @@ void test_omp_all_memory()
// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK2-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK2-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK2-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK2-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.5.i:
// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK2-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK2-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK2-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.7.i:
-// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK2-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK2-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
+// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
-// CHECK2-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK2-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK2-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
+// CHECK2-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.10.i:
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
-// CHECK2-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK2-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !112
// CHECK2-NEXT: br label [[CLEANUP_I]]
@@ -3092,13 +3092,13 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
-// CHECK2-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK2-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK2-51-NEXT: br label [[CLEANUP_I]]
// CHECK2-51: cleanup.i:
@@ -3155,7 +3155,7 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -3215,7 +3215,7 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -3569,26 +3569,26 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK2-51-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK2-51-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT:%.*]]
// CHECK2-51: .untied.jmp.2.i:
-// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK2-51-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
-// CHECK2-51-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK2-51-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK2-51-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK2-51-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK2-51-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK2-51-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK2-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.3.i:
-// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK2-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-51-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK2-51-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
-// CHECK2-51-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*))
// CHECK2-51-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.20*
// CHECK2-51-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20:%.*]], %struct.kmp_task_t_with_privates.20* [[TMP33]], i32 0, i32 0
// CHECK2-51-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20]], %struct.kmp_task_t_with_privates.20* [[TMP33]], i32 0, i32 2
@@ -3596,43 +3596,43 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK2-51-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK2-51-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
-// CHECK2-51-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK2-51-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK2-51-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK2-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.5.i:
// CHECK2-51-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
-// CHECK2-51-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK2-51-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK2-51-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK2-51-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.7.i:
-// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK2-51-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK2-51-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK2-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-51-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK2-51-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
-// CHECK2-51-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK2-51-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK2-51-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK2-51-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
+// CHECK2-51-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.10.i:
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
-// CHECK2-51-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
+// CHECK2-51-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !122
// CHECK2-51-NEXT: br label [[CLEANUP_I]]
@@ -4776,15 +4776,15 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK3-NEXT: br label [[CLEANUP_I]]
// CHECK3: cleanup.i:
@@ -4839,9 +4839,9 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -4899,9 +4899,9 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -5217,55 +5217,55 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 1, i32* [[TMP21]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK3-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK3: .untied.jmp.2.i:
-// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
-// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
+// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
+// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK3-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
-// CHECK3-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
+// CHECK3-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK3-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
-// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
+// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 2, i32* [[TMP31]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK3-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.6.i:
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK3-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 3, i32* [[TMP35]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK3-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.10.i:
-// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK3-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK3-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
+// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK3-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK3-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 4, i32* [[TMP41]], align 4
-// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK3-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK3-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.15.i:
// CHECK3-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@@ -5855,15 +5855,15 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK4-NEXT: br label [[CLEANUP_I]]
// CHECK4: cleanup.i:
@@ -5918,9 +5918,9 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -5978,9 +5978,9 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -6296,55 +6296,55 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 1, i32* [[TMP21]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK4: .untied.jmp.2.i:
-// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
-// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
+// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
+// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK4-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
-// CHECK4-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
+// CHECK4-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK4-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
-// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
+// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 2, i32* [[TMP31]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.6.i:
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 3, i32* [[TMP35]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.10.i:
-// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK4-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK4-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
+// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK4-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK4-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 4, i32* [[TMP41]], align 4
-// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK4-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.15.i:
// CHECK4-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@@ -6986,15 +6986,15 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK3-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK3-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK3-51-NEXT: br label [[CLEANUP_I]]
// CHECK3-51: cleanup.i:
@@ -7049,9 +7049,9 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -7109,9 +7109,9 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -7463,55 +7463,55 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 1, i32* [[TMP21]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK3-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT:%.*]]
// CHECK3-51: .untied.jmp.2.i:
-// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
-// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
+// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
+// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK3-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-51-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !122
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25:[0-9]+]]) #[[ATTR4]]
-// CHECK3-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25:[0-9]+]])
+// CHECK3-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*))
// CHECK3-51-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.20*
// CHECK3-51-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20:%.*]], %struct.kmp_task_t_with_privates.20* [[TMP25]], i32 0, i32 0
// CHECK3-51-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20]], %struct.kmp_task_t_with_privates.20* [[TMP25]], i32 0, i32 2
// CHECK3-51-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK3-51-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK3-51-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25]]) #[[ATTR4]]
-// CHECK3-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25]])
+// CHECK3-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK3-51-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 2, i32* [[TMP31]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK3-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.6.i:
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK3-51-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 3, i32* [[TMP35]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK3-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.10.i:
-// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK3-51-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK3-51-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK3-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !122
+// CHECK3-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !122
// CHECK3-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-51-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !122
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK3-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK3-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK3-51-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 4, i32* [[TMP41]], align 4
-// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
-// CHECK3-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK3-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.15.i:
// CHECK3-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@@ -8549,15 +8549,15 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
-// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
-// CHECK4-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
+// CHECK4-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK4-51-NEXT: br label [[CLEANUP_I]]
// CHECK4-51: cleanup.i:
@@ -8612,9 +8612,9 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
-// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -8672,9 +8672,9 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
-// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -8990,55 +8990,55 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 1, i32* [[TMP21]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK4-51: .untied.jmp.2.i:
-// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
-// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
+// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
+// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK4-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-51-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
-// CHECK4-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
+// CHECK4-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK4-51-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK4-51-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK4-51-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK4-51-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK4-51-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK4-51-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
-// CHECK4-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
+// CHECK4-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK4-51-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 2, i32* [[TMP31]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.6.i:
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK4-51-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 3, i32* [[TMP35]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.10.i:
-// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
+// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK4-51-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK4-51-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
-// CHECK4-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
+// CHECK4-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK4-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-51-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
-// CHECK4-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
+// CHECK4-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK4-51-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 4, i32* [[TMP41]], align 4
-// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
-// CHECK4-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
+// CHECK4-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.15.i:
// CHECK4-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
diff --git a/clang/test/OpenMP/task_if_codegen.cpp b/clang/test/OpenMP/task_if_codegen.cpp
index 8fd4db5431856..e8ecee2d47753 100644
--- a/clang/test/OpenMP/task_if_codegen.cpp
+++ b/clang/test/OpenMP/task_if_codegen.cpp
@@ -145,7 +145,7 @@ int main() {
// CHECK1-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !12
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !12
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !12
-// CHECK1-NEXT: call void @_Z9gtid_testv() #[[ATTR3]]
+// CHECK1-NEXT: call void @_Z9gtid_testv()
// CHECK1-NEXT: ret i32 0
//
//
diff --git a/clang/test/OpenMP/task_in_reduction_codegen.cpp b/clang/test/OpenMP/task_in_reduction_codegen.cpp
index 3eb4696acc259..577d23d85d3d8 100644
--- a/clang/test/OpenMP/task_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/task_in_reduction_codegen.cpp
@@ -612,18 +612,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP21]] to i8*
-// CHECK1-NEXT: [[TMP25:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP22]], i8* [[TMP24]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP25:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP22]], i8* [[TMP24]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP25]] to i32*
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP27:%.*]] = load i16*, i16** [[TMP26]], align 8
// CHECK1-NEXT: [[TMP28:%.*]] = mul nuw i64 [[TMP14]], 2
// CHECK1-NEXT: [[TMP29:%.*]] = udiv exact i64 [[TMP28]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
-// CHECK1-NEXT: [[TMP30:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP30:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to i64*
// CHECK1-NEXT: store i64 [[TMP29]], i64* [[TMP31]], align 8
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP19]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i16* [[TMP27]] to i8*
-// CHECK1-NEXT: [[TMP34:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP32]], i8* [[TMP33]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP34:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP32]], i8* [[TMP33]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP34]] to i16*
// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[CONV_I]], align 4
// CHECK1-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP35]] to i64
@@ -672,7 +672,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP12]] to i8*
-// CHECK1-NEXT: [[TMP15:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP13]], i8* null, i8* [[TMP14]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP15:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP13]], i8* null, i8* [[TMP14]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP15]] to i32*
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[CONV_I]], align 4
// CHECK1-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP16]], 1
diff --git a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
index 1a9b5afa1795d..25bfb6463f5db 100644
--- a/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_in_reduction_codegen.cpp
@@ -624,18 +624,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
-// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
-// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32
diff --git a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
index 7dc0fc9774ae2..a46f757a90ffb 100644
--- a/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
+++ b/clang/test/OpenMP/taskloop_simd_in_reduction_codegen.cpp
@@ -624,18 +624,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
-// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
-// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
+// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32
diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
index d26ba1d63f1fe..9e19165ac9a39 100644
--- a/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
+++ b/clang/test/OpenMP/teams_distribute_parallel_for_reduction_task_codegen.cpp
@@ -838,7 +838,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
-// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -861,7 +861,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
-// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
+// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 2fb00f95b7495..7e835bd863ec4 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2194,9 +2194,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
CI->setTailCallKind(ChildTCK);
InlinedMustTailCalls |= CI->isMustTailCall();
- // Calls inlined through a 'nounwind' call site should be marked
- // 'nounwind'.
- if (MarkNoUnwind)
+ // Call sites inlined through a 'nounwind' call site should be
+ // 'nounwind' as well. However, avoid marking call sites explicitly
+ // where possible. This helps expose more opportunities for CSE after
+ // inlining, commonly when the callee is an intrinsic.
+ if (MarkNoUnwind && !CI->doesNotThrow())
CI->setDoesNotThrow();
}
}
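
(Aside, for illustration: the CSE opportunity the new comment refers to can be seen in a minimal hypothetical sketch. The values %a/%b and the surrounding function are made up, but @llvm.smax.i32 is the same intrinsic exercised in the coro-retcon-once-value.ll update below, and intrinsics like it are already nounwind by definition.

    ; The caller F already contains a plain call to a nounwind intrinsic:
    %a = call i32 @llvm.smax.i32(i32 %x, i32 0)
    ; Before this patch, the copy of the same call inlined from a nounwind
    ; callee G carried an explicit call-site attribute group, so it was not
    ; identical to %a and CSE would not fold the two calls:
    %b = call i32 @llvm.smax.i32(i32 %x, i32 0) #0   ; #0 = { nounwind }
    ; With the patch, the redundant attribute is omitted because
    ; CI->doesNotThrow() is already true, the calls match, and CSE can
    ; replace %b with %a:
    %b = call i32 @llvm.smax.i32(i32 %x, i32 0)

This is why the #[[ATTR0:...]] suffix disappears from the @llvm.smax.i32 call in the coroutine test update below, and why the many #[[ATTR...]] suffixes drop off the intrinsic and runtime calls in the OpenMP tests above.)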
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
index b780d7ed5a44b..bd7f3aba5b92a 100644
--- a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
+++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll
@@ -53,7 +53,7 @@ define void @test(i32* %array) {
; CHECK-NEXT: store i32* [[ARRAY:%.*]], i32** [[TMP0]], align 8
; CHECK-NEXT: [[LOAD_I:%.*]] = load i32, i32* [[ARRAY]], align 4
; CHECK-NEXT: [[LOAD_POS_I:%.*]] = icmp sgt i32 [[LOAD_I]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0) #[[ATTR0:[0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0)
; CHECK-NEXT: call void @print(i32 [[TMP1]])
; CHECK-NEXT: [[CONT_CAST:%.*]] = select i1 [[LOAD_POS_I]], void (i8*, i1)* @f.resume.0, void (i8*, i1)* @f.resume.1
; CHECK-NEXT: call void [[CONT_CAST]](i8* nonnull [[DOTSUB]], i1 zeroext false)
diff --git a/llvm/test/Transforms/Inline/noalias-calls-always.ll b/llvm/test/Transforms/Inline/noalias-calls-always.ll
index 60c562fa1ae32..a26e7c0d76d93 100644
--- a/llvm/test/Transforms/Inline/noalias-calls-always.ll
+++ b/llvm/test/Transforms/Inline/noalias-calls-always.ll
@@ -34,11 +34,11 @@ define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b)
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false) #[[ATTR6:[0-9]+]], !noalias !3
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !alias.scope !5
-; CHECK-NEXT: call void @hey() #[[ATTR6]], !noalias !5
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !noalias !0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false), !noalias !3
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false), !noalias !0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !5
+; CHECK-NEXT: call void @hey(), !noalias !5
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
@@ -75,11 +75,11 @@ define void @foo_cs(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !9
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !6
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !alias.scope !11
-; CHECK-NEXT: call void @hey() #[[ATTR6]], !noalias !11
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !noalias !6
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false), !noalias !9
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false), !noalias !6
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !11
+; CHECK-NEXT: call void @hey(), !noalias !11
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/Inline/noalias-calls.ll b/llvm/test/Transforms/Inline/noalias-calls.ll
index b11aa500ba814..02daffe13584c 100644
--- a/llvm/test/Transforms/Inline/noalias-calls.ll
+++ b/llvm/test/Transforms/Inline/noalias-calls.ll
@@ -37,11 +37,11 @@ define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b)
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false) #[[ATTR3]], !noalias !3
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !alias.scope !5
-; CHECK-NEXT: call void @hey() #[[ATTR3]], !noalias !5
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false), !noalias !3
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !5
+; CHECK-NEXT: call void @hey(), !noalias !5
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
@@ -80,11 +80,11 @@ define void @foo_cs(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false) #[[ATTR3]], !noalias !9
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !6
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !alias.scope !11
-; CHECK-NEXT: call void @hey() #[[ATTR3]], !noalias !11
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !6
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false), !noalias !9
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !11
+; CHECK-NEXT: call void @hey(), !noalias !11
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/Inline/noalias-cs.ll b/llvm/test/Transforms/Inline/noalias-cs.ll
index fd310a8f6b760..edc0b8755a0cd 100644
--- a/llvm/test/Transforms/Inline/noalias-cs.ll
+++ b/llvm/test/Transforms/Inline/noalias-cs.ll
@@ -65,8 +65,8 @@ define void @caller(float* nocapture %a, float* nocapture %b, float** nocapture
; CHECK-LABEL: @caller(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = load float*, float** [[C_PTR:%.*]], align 8, !alias.scope !6
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) #[[ATTR2:[0-9]+]], !noalias !6
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) #[[ATTR2]], !noalias !6
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]), !noalias !6
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]), !noalias !6
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4, !noalias !14
; CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 5
; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I_I]], align 4, !alias.scope !9, !noalias !15
@@ -75,8 +75,8 @@ define void @caller(float* nocapture %a, float* nocapture %b, float** nocapture
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[C]], align 4, !noalias !6
; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 7
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX_I]], align 4, !noalias !6
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]) #[[ATTR2]], !alias.scope !6
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) #[[ATTR2]], !alias.scope !6
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]), !alias.scope !6
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]), !alias.scope !6
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[A]], align 4, !alias.scope !6, !noalias !22
; CHECK-NEXT: [[ARRAYIDX_I_I1:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX_I_I1]], align 4, !alias.scope !23, !noalias !20
diff --git a/llvm/test/Transforms/Inline/noalias2.ll b/llvm/test/Transforms/Inline/noalias2.ll
index 8250d1bfc53c1..eb280a14b5073 100644
--- a/llvm/test/Transforms/Inline/noalias2.ll
+++ b/llvm/test/Transforms/Inline/noalias2.ll
@@ -71,8 +71,8 @@ define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture rea
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) #[[ATTR2]]
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
+; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4, !alias.scope !15, !noalias !16
; CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I_I]], align 4, !alias.scope !16, !noalias !15
diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
index 5d59a293d29f1..d8b950e1e76ed 100644
--- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
+++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll
@@ -65,7 +65,7 @@ define void @arm_mult_q15(i16* %pSrcA, i16* %pSrcB, i16 * noalias %pDst, i32 %bl
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
-; CHECK-NEXT: [[TMP12:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, i16* [[PDST_ADDR_05]], i32 1
; CHECK-NEXT: store i16 [[CONV3]], i16* [[PDST_ADDR_05]], align 2
diff --git a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
index 99a52acd3b2b1..7df88d111008c 100644
--- a/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
+++ b/llvm/test/Transforms/PhaseOrdering/deletion-of-loops-that-became-side-effect-free.ll
@@ -120,7 +120,7 @@ define dso_local zeroext i1 @is_not_empty_variant3(%struct.node* %p) {
; O1-NEXT: [[SIZE_06_I:%.*]] = phi i64 [ [[INC_I:%.*]], [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
; O1-NEXT: [[P_ADDR_05_I:%.*]] = phi %struct.node* [ [[TMP0:%.*]], [[WHILE_BODY_I]] ], [ [[P]], [[ENTRY]] ]
; O1-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[SIZE_06_I]], -1
-; O1-NEXT: call void @llvm.assume(i1 [[CMP_I]]) #[[ATTR3:[0-9]+]]
+; O1-NEXT: call void @llvm.assume(i1 [[CMP_I]])
; O1-NEXT: [[NEXT_I:%.*]] = getelementptr inbounds [[STRUCT_NODE:%.*]], %struct.node* [[P_ADDR_05_I]], i64 0, i32 0
; O1-NEXT: [[TMP0]] = load %struct.node*, %struct.node** [[NEXT_I]], align 8
; O1-NEXT: [[INC_I]] = add i64 [[SIZE_06_I]], 1