[clang] 7af2818 - Update some more tests with update_cc_test_checks.py

Nicolai Hähnle via cfe-commits cfe-commits at lists.llvm.org
Wed Jul 20 04:27:31 PDT 2022


Author: Nicolai Hähnle
Date: 2022-07-20T13:27:18+02:00
New Revision: 7af2818a995efe84aa0bffa6ababe57ec553ef81

URL: https://github.com/llvm/llvm-project/commit/7af2818a995efe84aa0bffa6ababe57ec553ef81
DIFF: https://github.com/llvm/llvm-project/commit/7af2818a995efe84aa0bffa6ababe57ec553ef81.diff

LOG: Update some more tests with update_cc_test_checks.py

Added: 
    

Modified: 
    clang/test/CodeGen/aarch64-ls64.c
    clang/test/CodeGen/aarch64-neon-across.c
    clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
    clang/test/CodeGen/aarch64-neon-fma.c
    clang/test/CodeGen/aarch64-neon-tbl.c
    clang/test/CodeGen/aarch64-poly128.c
    clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
    clang/test/CodeGen/arm-neon-fma.c
    clang/test/CodeGen/arm-neon-numeric-maxmin.c
    clang/test/CodeGen/arm-neon-vcvtX.c

Removed: 
    


################################################################################
diff  --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
index f7fd5b19c18a7..dcf8af7438f37 100644
--- a/clang/test/CodeGen/aarch64-ls64.c
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -1,8 +1,8 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
-// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
-// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck --check-prefixes=CHECK-C %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck --check-prefixes=CHECK-CXX %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck  --check-prefixes=CHECK-C %s
+// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck  --check-prefixes=CHECK-CXX %s
 
 #include <arm_acle.h>
 
@@ -16,147 +16,274 @@ data512_t val;
 void *addr;
 uint64_t status;
 
-// CHECK-LABEL: @test_ld64b(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
-// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata !6)
-// CHECK-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
-// CHECK-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
-// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[REF_TMP]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) [[ATTR2:#.*]], !noalias !6
-// CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
-// CHECK-NEXT:    store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 1
-// CHECK-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 2
-// CHECK-NEXT:    store i64 [[TMP7]], i64* [[TMP6]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
-// CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 3
-// CHECK-NEXT:    store i64 [[TMP9]], i64* [[TMP8]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
-// CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 4
-// CHECK-NEXT:    store i64 [[TMP11]], i64* [[TMP10]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
-// CHECK-NEXT:    [[TMP13:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 5
-// CHECK-NEXT:    store i64 [[TMP13]], i64* [[TMP12]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
-// CHECK-NEXT:    [[TMP15:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 6
-// CHECK-NEXT:    store i64 [[TMP15]], i64* [[TMP14]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
-// CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 7
-// CHECK-NEXT:    store i64 [[TMP17]], i64* [[TMP16]], align 8, !alias.scope !6
-// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.data512_t* [[REF_TMP]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 bitcast (%struct.data512_t* @val to i8*), i8* align 8 [[TMP18]], i64 64, i1 false)
-// CHECK-NEXT:    ret void
+// CHECK-C-LABEL: @test_ld64b(
+// CHECK-C-NEXT:  entry:
+// CHECK-C-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-C-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-C-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-C-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-C-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-C-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[TMP]], i32 0, i32 0
+// CHECK-C-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-C-NEXT:    [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
+// CHECK-C-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
+// CHECK-C-NEXT:    store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 1
+// CHECK-C-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-C-NEXT:    [[TMP7:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 2
+// CHECK-C-NEXT:    store i64 [[TMP7]], i64* [[TMP6]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-C-NEXT:    [[TMP9:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 3
+// CHECK-C-NEXT:    store i64 [[TMP9]], i64* [[TMP8]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-C-NEXT:    [[TMP11:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 4
+// CHECK-C-NEXT:    store i64 [[TMP11]], i64* [[TMP10]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-C-NEXT:    [[TMP13:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 5
+// CHECK-C-NEXT:    store i64 [[TMP13]], i64* [[TMP12]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-C-NEXT:    [[TMP15:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 6
+// CHECK-C-NEXT:    store i64 [[TMP15]], i64* [[TMP14]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-C-NEXT:    [[TMP17:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 7
+// CHECK-C-NEXT:    store i64 [[TMP17]], i64* [[TMP16]], align 8, !alias.scope !6
+// CHECK-C-NEXT:    [[TMP18:%.*]] = bitcast %struct.data512_t* [[TMP]] to i8*
+// CHECK-C-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 bitcast (%struct.data512_t* @val to i8*), i8* align 8 [[TMP18]], i64 64, i1 false)
+// CHECK-C-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: @test_ld64b(
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-CXX-NEXT:    [[REF_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-CXX-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
+// CHECK-CXX-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-CXX-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[REF_TMP]], i32 0, i32 0
+// CHECK-CXX-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
+// CHECK-CXX-NEXT:    store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 1
+// CHECK-CXX-NEXT:    store i64 [[TMP5]], i64* [[TMP4]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 2
+// CHECK-CXX-NEXT:    store i64 [[TMP7]], i64* [[TMP6]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 3
+// CHECK-CXX-NEXT:    store i64 [[TMP9]], i64* [[TMP8]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 4
+// CHECK-CXX-NEXT:    store i64 [[TMP11]], i64* [[TMP10]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 5
+// CHECK-CXX-NEXT:    store i64 [[TMP13]], i64* [[TMP12]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 6
+// CHECK-CXX-NEXT:    store i64 [[TMP15]], i64* [[TMP14]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 7
+// CHECK-CXX-NEXT:    store i64 [[TMP17]], i64* [[TMP16]], align 8, !alias.scope !6
+// CHECK-CXX-NEXT:    [[TMP18:%.*]] = bitcast %struct.data512_t* [[REF_TMP]] to i8*
+// CHECK-CXX-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 bitcast (%struct.data512_t* @val to i8*), i8* align 8 [[TMP18]], i64 64, i1 false)
+// CHECK-CXX-NEXT:    ret void
 //
 EXTERN_C void test_ld64b(void)
 {
     val = __arm_ld64b(addr);
 }
 
-// CHECK-LABEL: @test_st64b(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
-// CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
-// CHECK-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
-// CHECK-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
-// CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
-// CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
-// CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-NEXT:    call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]]
-// CHECK-NEXT:    ret void
+// CHECK-C-LABEL: @test_st64b(
+// CHECK-C-NEXT:  entry:
+// CHECK-C-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-C-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-C-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[BYVAL_TEMP]] to i8*
+// CHECK-C-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-C-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[BYVAL_TEMP]], i32 0, i32 0
+// CHECK-C-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-C-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-C-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-C-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-C-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-C-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-C-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-C-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-C-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-C-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-C-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-C-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-C-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-C-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-C-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-C-NEXT:    call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: @test_st64b(
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-CXX-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
+// CHECK-CXX-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-CXX-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
+// CHECK-CXX-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-CXX-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT:    ret void
 //
 EXTERN_C void test_st64b(void)
 {
     __arm_st64b(addr, val);
 }
 
-// CHECK-LABEL: @test_st64bv(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
-// CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
-// CHECK-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
-// CHECK-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
-// CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
-// CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
-// CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]]
-// CHECK-NEXT:    store i64 [[TMP18]], i64* @status, align 8
-// CHECK-NEXT:    ret void
+// CHECK-C-LABEL: @test_st64bv(
+// CHECK-C-NEXT:  entry:
+// CHECK-C-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-C-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-C-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[BYVAL_TEMP]] to i8*
+// CHECK-C-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-C-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[BYVAL_TEMP]], i32 0, i32 0
+// CHECK-C-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-C-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-C-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-C-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-C-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-C-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-C-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-C-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-C-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-C-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-C-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-C-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-C-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-C-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-C-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-C-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT:    store i64 [[TMP18]], i64* @status, align 8
+// CHECK-C-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: @test_st64bv(
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-CXX-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
+// CHECK-CXX-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-CXX-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
+// CHECK-CXX-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-CXX-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-CXX-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT:    store i64 [[TMP18]], i64* @status, align 8
+// CHECK-CXX-NEXT:    ret void
 //
 EXTERN_C void test_st64bv(void)
 {
     status = __arm_st64bv(addr, val);
 }
 
-// CHECK-LABEL: @test_st64bv0(
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
-// CHECK-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
-// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
-// CHECK-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
-// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
-// CHECK-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
-// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
-// CHECK-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
-// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
-// CHECK-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
-// CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
-// CHECK-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
-// CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
-// CHECK-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
-// CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
-// CHECK-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
-// CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
-// CHECK-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
-// CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
-// CHECK-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
-// CHECK-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]]
-// CHECK-NEXT:    store i64 [[TMP18]], i64* @status, align 8
-// CHECK-NEXT:    ret void
+// CHECK-C-LABEL: @test_st64bv0(
+// CHECK-C-NEXT:  entry:
+// CHECK-C-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-C-NEXT:    [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-C-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-C-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[BYVAL_TEMP]] to i8*
+// CHECK-C-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-C-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-C-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[BYVAL_TEMP]], i32 0, i32 0
+// CHECK-C-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-C-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-C-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-C-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-C-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-C-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-C-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-C-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-C-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-C-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-C-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-C-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-C-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-C-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-C-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-C-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-C-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-C-NEXT:    store i64 [[TMP18]], i64* @status, align 8
+// CHECK-C-NEXT:    ret void
+//
+// CHECK-CXX-LABEL: @test_st64bv0(
+// CHECK-CXX-NEXT:  entry:
+// CHECK-CXX-NEXT:    [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-CXX-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8*
+// CHECK-CXX-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false)
+// CHECK-CXX-NEXT:    store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8
+// CHECK-CXX-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0
+// CHECK-CXX-NEXT:    [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-CXX-NEXT:    [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8
+// CHECK-CXX-NEXT:    [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
+// CHECK-CXX-NEXT:    [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8
+// CHECK-CXX-NEXT:    [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2
+// CHECK-CXX-NEXT:    [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8
+// CHECK-CXX-NEXT:    [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3
+// CHECK-CXX-NEXT:    [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8
+// CHECK-CXX-NEXT:    [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4
+// CHECK-CXX-NEXT:    [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8
+// CHECK-CXX-NEXT:    [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5
+// CHECK-CXX-NEXT:    [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8
+// CHECK-CXX-NEXT:    [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6
+// CHECK-CXX-NEXT:    [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
+// CHECK-CXX-NEXT:    [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
+// CHECK-CXX-NEXT:    [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
+// CHECK-CXX-NEXT:    [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
+// CHECK-CXX-NEXT:    store i64 [[TMP18]], i64* @status, align 8
+// CHECK-CXX-NEXT:    ret void
 //
 EXTERN_C void test_st64bv0(void)
 {

diff  --git a/clang/test/CodeGen/aarch64-neon-across.c b/clang/test/CodeGen/aarch64-neon-across.c
index ee62b04febb8d..6cc58ca970271 100644
--- a/clang/test/CodeGen/aarch64-neon-across.c
+++ b/clang/test/CodeGen/aarch64-neon-across.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:  -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
@@ -5,341 +6,471 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddlv_s8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
-// CHECK:   ret i16 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vaddlv_s8(int8x8_t a) {
   return vaddlv_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddlv_s16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   ret i32 [[VADDLV_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDLV_I]]
+//
 int32_t test_vaddlv_s16(int16x4_t a) {
   return vaddlv_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddlv_u8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
-// CHECK:   ret i16 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vaddlv_u8(uint8x8_t a) {
   return vaddlv_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddlv_u16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   ret i32 [[VADDLV_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDLV_I]]
+//
 uint32_t test_vaddlv_u16(uint16x4_t a) {
   return vaddlv_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddlvq_s8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
-// CHECK:   ret i16 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vaddlvq_s8(int8x16_t a) {
   return vaddlvq_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddlvq_s16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   ret i32 [[VADDLV_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDLV_I]]
+//
 int32_t test_vaddlvq_s16(int16x8_t a) {
   return vaddlvq_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vaddlvq_s32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i64 [[VADDLVQ_S32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i64 [[VADDLVQ_S32_I]]
+//
 int64_t test_vaddlvq_s32(int32x4_t a) {
   return vaddlvq_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddlvq_u8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
-// CHECK:   ret i16 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
   return vaddlvq_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddlvq_u16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   ret i32 [[VADDLV_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDLV_I]]
+//
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
   return vaddlvq_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vaddlvq_u32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i64 [[VADDLVQ_U32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i64 [[VADDLVQ_U32_I]]
+//
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
   return vaddlvq_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vmaxv_s8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vmaxv_s8(int8x8_t a) {
   return vmaxv_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vmaxv_s16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vmaxv_s16(int16x4_t a) {
   return vmaxv_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vmaxv_u8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vmaxv_u8(uint8x8_t a) {
   return vmaxv_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vmaxv_u16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vmaxv_u16(uint16x4_t a) {
   return vmaxv_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vmaxvq_s8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vmaxvq_s8(int8x16_t a) {
   return vmaxvq_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vmaxvq_s16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vmaxvq_s16(int16x8_t a) {
   return vmaxvq_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vmaxvq_s32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VMAXVQ_S32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VMAXVQ_S32_I]]
+//
 int32_t test_vmaxvq_s32(int32x4_t a) {
   return vmaxvq_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vmaxvq_u8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vmaxvq_u8(uint8x16_t a) {
   return vmaxvq_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vmaxvq_u16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vmaxvq_u16(uint16x8_t a) {
   return vmaxvq_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vmaxvq_u32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VMAXVQ_U32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VMAXVQ_U32_I]]
+//
 uint32_t test_vmaxvq_u32(uint32x4_t a) {
   return vmaxvq_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vminv_s8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminv_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vminv_s8(int8x8_t a) {
   return vminv_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vminv_s16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminv_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vminv_s16(int16x4_t a) {
   return vminv_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vminv_u8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminv_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vminv_u8(uint8x8_t a) {
   return vminv_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vminv_u16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminv_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vminv_u16(uint16x4_t a) {
   return vminv_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vminvq_s8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vminvq_s8(int8x16_t a) {
   return vminvq_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vminvq_s16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vminvq_s16(int16x8_t a) {
   return vminvq_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vminvq_s32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VMINVQ_S32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VMINVQ_S32_I]]
+//
 int32_t test_vminvq_s32(int32x4_t a) {
   return vminvq_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vminvq_u8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vminvq_u8(uint8x16_t a) {
   return vminvq_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vminvq_u16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vminvq_u16(uint16x8_t a) {
   return vminvq_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vminvq_u32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VMINVQ_U32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VMINVQ_U32_I]]
+//
 uint32_t test_vminvq_u32(uint32x4_t a) {
   return vminvq_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vaddv_s8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vaddv_s8(int8x8_t a) {
   return vaddv_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddv_s16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vaddv_s16(int16x4_t a) {
   return vaddv_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vaddv_u8(<8 x i8> noundef %a) #0 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vaddv_u8(uint8x8_t a) {
   return vaddv_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddv_u16(<4 x i16> noundef %a) #0 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vaddv_u16(uint16x4_t a) {
   return vaddv_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vaddvq_s8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 int8_t test_vaddvq_s8(int8x16_t a) {
   return vaddvq_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddvq_s16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 int16_t test_vaddvq_s16(int16x8_t a) {
   return vaddvq_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddvq_s32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VADDVQ_S32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDVQ_S32_I]]
+//
 int32_t test_vaddvq_s32(int32x4_t a) {
   return vaddvq_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i8 @test_vaddvq_u8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #3
-// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
-// CHECK:   ret i8 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK-NEXT:    ret i8 [[TMP0]]
+//
 uint8_t test_vaddvq_u8(uint8x16_t a) {
   return vaddvq_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vaddvq_u16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a) #3
-// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
-// CHECK:   ret i16 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK-NEXT:    ret i16 [[TMP0]]
+//
 uint16_t test_vaddvq_u16(uint16x8_t a) {
   return vaddvq_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vaddvq_u32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a) #3
-// CHECK:   ret i32 [[VADDVQ_U32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret i32 [[VADDVQ_U32_I]]
+//
 uint32_t test_vaddvq_u32(uint32x4_t a) {
   return vaddvq_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmaxvq_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> %a) #3
-// CHECK:   ret float [[VMAXVQ_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret float [[VMAXVQ_F32_I]]
+//
 float32_t test_vmaxvq_f32(float32x4_t a) {
   return vmaxvq_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vminvq_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> %a) #3
-// CHECK:   ret float [[VMINVQ_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret float [[VMINVQ_F32_I]]
+//
 float32_t test_vminvq_f32(float32x4_t a) {
   return vminvq_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmaxnmvq_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> %a) #3
-// CHECK:   ret float [[VMAXNMVQ_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret float [[VMAXNMVQ_F32_I]]
+//
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
   return vmaxnmvq_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vminnmvq_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> %a) #3
-// CHECK:   ret float [[VMINNMVQ_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret float [[VMINNMVQ_F32_I]]
+//
 float32_t test_vminnmvq_f32(float32x4_t a) {
   return vminnmvq_f32(a);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"

diff  --git a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
index 571a67fbf19db..e2407544e70a2 100644
--- a/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:  -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
@@ -5,149 +6,212 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} float @test_vcvtxd_f32_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2
-// CHECK:   ret float [[VCVTXD_F32_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtxd_f32_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double [[A]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    ret float [[VCVTXD_F32_F64_I]]
+//
 float32_t test_vcvtxd_f32_f64(float64_t a) {
   return (float32_t)vcvtxd_f32_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtas_s32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTAS_S32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_s32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTAS_S32_F32_I]]
+//
 int32_t test_vcvtas_s32_f32(float32_t a) {
   return (int32_t)vcvtas_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_test_vcvtad_s64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTAD_S64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_test_vcvtad_s64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTAD_S64_F64_I]]
+//
 int64_t test_test_vcvtad_s64_f64(float64_t a) {
   return (int64_t)vcvtad_s64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtas_u32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTAS_U32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTAS_U32_F32_I]]
+//
 uint32_t test_vcvtas_u32_f32(float32_t a) {
   return (uint32_t)vcvtas_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtad_u64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTAD_U64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_u64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTAD_U64_F64_I]]
+//
 uint64_t test_vcvtad_u64_f64(float64_t a) {
   return (uint64_t)vcvtad_u64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtms_s32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTMS_S32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTMS_S32_F32_I]]
+//
 int32_t test_vcvtms_s32_f32(float32_t a) {
   return (int32_t)vcvtms_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtmd_s64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTMD_S64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_s64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTMD_S64_F64_I]]
+//
 int64_t test_vcvtmd_s64_f64(float64_t a) {
   return (int64_t)vcvtmd_s64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtms_u32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTMS_U32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTMS_U32_F32_I]]
+//
 uint32_t test_vcvtms_u32_f32(float32_t a) {
   return (uint32_t)vcvtms_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtmd_u64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTMD_U64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_u64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTMD_U64_F64_I]]
+//
 uint64_t test_vcvtmd_u64_f64(float64_t a) {
   return (uint64_t)vcvtmd_u64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtns_s32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTNS_S32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTNS_S32_F32_I]]
+//
 int32_t test_vcvtns_s32_f32(float32_t a) {
   return (int32_t)vcvtns_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtnd_s64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTND_S64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_s64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTND_S64_F64_I]]
+//
 int64_t test_vcvtnd_s64_f64(float64_t a) {
   return (int64_t)vcvtnd_s64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtns_u32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTNS_U32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTNS_U32_F32_I]]
+//
 uint32_t test_vcvtns_u32_f32(float32_t a) {
   return (uint32_t)vcvtns_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtnd_u64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTND_U64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_u64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTND_U64_F64_I]]
+//
 uint64_t test_vcvtnd_u64_f64(float64_t a) {
   return (uint64_t)vcvtnd_u64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtps_s32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTPS_S32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTPS_S32_F32_I]]
+//
 int32_t test_vcvtps_s32_f32(float32_t a) {
   return (int32_t)vcvtps_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtpd_s64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTPD_S64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_s64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTPD_S64_F64_I]]
+//
 int64_t test_vcvtpd_s64_f64(float64_t a) {
   return (int64_t)vcvtpd_s64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvtps_u32_f32(float noundef %a) #0 {
-// CHECK:   [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2
-// CHECK:   ret i32 [[VCVTPS_U32_F32_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTPS_U32_F32_I]]
+//
 uint32_t test_vcvtps_u32_f32(float32_t a) {
   return (uint32_t)vcvtps_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtpd_u64_f64(double noundef %a) #0 {
-// CHECK:   [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2
-// CHECK:   ret i64 [[VCVTPD_U64_F64_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_u64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTPD_U64_F64_I]]
+//
 uint64_t test_vcvtpd_u64_f64(float64_t a) {
   return (uint64_t)vcvtpd_u64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvts_s32_f32(float noundef %a) #0 {
-// CHECK:   [[TMP0:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float %a)
-// CHECK:   ret i32 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTS_S32_F32_I]]
+//
 int32_t test_vcvts_s32_f32(float32_t a) {
   return (int32_t)vcvts_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtd_s64_f64(double noundef %a) #0 {
-// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double %a)
-// CHECK:   ret i64 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTD_S64_F64_I]]
+//
 int64_t test_vcvtd_s64_f64(float64_t a) {
   return (int64_t)vcvtd_s64_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vcvts_u32_f32(float noundef %a) #0 {
-// CHECK:   [[TMP0:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float %a)
-// CHECK:   ret i32 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
+// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i32 [[VCVTS_U32_F32_I]]
+//
 uint32_t test_vcvts_u32_f32(float32_t a) {
   return (uint32_t)vcvts_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vcvtd_u64_f64(double noundef %a) #0 {
-// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double %a)
-// CHECK:   ret i64 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u64_f64
+// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]]) #[[ATTR2]]
+// CHECK-NEXT:    ret i64 [[VCVTD_U64_F64_I]]
+//
 uint64_t test_vcvtd_u64_f64(float64_t a) {
   return (uint64_t)vcvtd_u64_f64(a);
 }

diff  --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c
index 3c964558b3f4d..3498fdab97638 100644
--- a/clang/test/CodeGen/aarch64-neon-fma.c
+++ b/clang/test/CodeGen/aarch64-neon-fma.c
@@ -1,247 +1,317 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmla_n_f32(<2 x float> noundef %a, <2 x float> noundef %b, float noundef %c) #0 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
-// CHECK:   ret <2 x float> [[ADD_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmla_n_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT:    ret <2 x float> [[ADD_I]]
+//
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmla_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlaq_n_f32(<4 x float> noundef %a, <4 x float> noundef %b, float noundef %c) #1 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
-// CHECK:   ret <4 x float> [[ADD_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_n_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT:    ret <4 x float> [[ADD_I]]
+//
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlaq_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlsq_n_f32(<4 x float> noundef %a, <4 x float> noundef %b, float noundef %c) #1 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
-// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
-// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
-// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
-// CHECK:   ret <4 x float> [[SUB_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_n_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[C]], i32 3
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x float> [[B]], [[VECINIT3_I]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub <4 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT:    ret <4 x float> [[SUB_I]]
+//
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlsq_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmls_n_f32(<2 x float> noundef %a, <2 x float> noundef %b, float noundef %c) #0 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
-// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
-// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
-// CHECK:   ret <2 x float> [[SUB_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmls_n_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[C]], i32 1
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <2 x float> [[B]], [[VECINIT1_I]]
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub <2 x float> [[A]], [[MUL_I]]
+// CHECK-NEXT:    ret <2 x float> [[SUB_I]]
+//
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmls_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmla_lane_f32_0(<2 x float> noundef %a, <2 x float> noundef %b, <2 x float> noundef %v) #0 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32_0
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[ADD]]
+//
 float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmla_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlaq_lane_f32_0(<4 x float> noundef %a, <4 x float> noundef %b, <2 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32_0
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[ADD]]
+//
 float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlaq_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmla_laneq_f32_0(<2 x float> noundef %a, <2 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32_0
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[ADD]]
+//
 float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmla_laneq_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32_0
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[ADD]]
+//
 float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlaq_laneq_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmls_lane_f32_0(<2 x float> noundef %a, <2 x float> noundef %b, <2 x float> noundef %v) #0 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32_0
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[SUB]]
+//
 float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmls_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlsq_lane_f32_0(<4 x float> noundef %a, <4 x float> noundef %b, <2 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32_0
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[SUB]]
+//
 float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlsq_lane_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmls_laneq_f32_0(<2 x float> noundef %a, <2 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32_0
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[SUB]]
+//
 float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmls_laneq_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32_0
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[SUB]]
+//
 float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlsq_laneq_f32(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmla_lane_f32(<2 x float> noundef %a, <2 x float> noundef %b, <2 x float> noundef %v) #0 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmla_lane_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[ADD]]
+//
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmla_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlaq_lane_f32(<4 x float> noundef %a, <4 x float> noundef %b, <2 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_lane_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[ADD]]
+//
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlaq_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmla_laneq_f32(<2 x float> noundef %a, <2 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmla_laneq_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[ADD]]
+//
 float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmla_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlaq_laneq_f32(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlaq_laneq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[ADD:%.*]] = fadd <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[ADD]]
+//
 float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlaq_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmls_lane_f32(<2 x float> noundef %a, <2 x float> noundef %b, <2 x float> noundef %v) #0 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmls_lane_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[SUB]]
+//
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
   return vmls_lane_f32(a, b, v, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlsq_lane_f32(<4 x float> noundef %a, <4 x float> noundef %b, <2 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_lane_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <2 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[V]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[SUB]]
 //
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
   return vmlsq_lane_f32(a, b, v, 1);
 }
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmls_laneq_f32(<2 x float> noundef %a, <2 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
-// CHECK:    [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <2 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmls_laneq_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <2 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <2 x float> [[SUB]]
+//
 float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
   return vmls_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmlsq_laneq_f32(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %v) #1 {
-// CHECK:    [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
-// CHECK:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
-// CHECK:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:    [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
-// CHECK:    [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
-// CHECK:    ret <4 x float> [[SUB]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmlsq_laneq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], <4 x float> noundef [[V:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[V]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x float> [[B]], [[LANE]]
+// CHECK-NEXT:    [[SUB:%.*]] = fsub <4 x float> [[A]], [[MUL]]
+// CHECK-NEXT:    ret <4 x float> [[SUB]]
+//
 float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
   return vmlsq_laneq_f32(a, b, v, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vfmaq_n_f64(<2 x double> noundef %a, <2 x double> noundef %b, double noundef %c) #1 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
-// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %b, <2 x double> [[VECINIT1_I]], <2 x double> %a)
-// CHECK:   ret <2 x double> [[TMP6]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f64
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    ret <2 x double> [[TMP3]]
+//
 float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
   return vfmaq_n_f64(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vfmsq_n_f64(<2 x double> noundef %a, <2 x double> noundef %b, double noundef %c) #1 {
-// CHECK:   [[SUB_I:%.*]] = fneg <2 x double> %b
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
-// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[SUB_I]], <2 x double> [[VECINIT1_I]], <2 x double> %a) #3
-// CHECK:   ret <2 x double> [[TMP6]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsq_n_f64
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]], double noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <2 x double> [[B]]
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double [[C]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x double> [[TMP3]]
+//
 float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
   return vfmsq_n_f64(a, b, c);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"

diff  --git a/clang/test/CodeGen/aarch64-neon-tbl.c b/clang/test/CodeGen/aarch64-neon-tbl.c
index 3df2a3d81f3c0..71ac7d425c451 100644
--- a/clang/test/CodeGen/aarch64-neon-tbl.c
+++ b/clang/test/CodeGen/aarch64-neon-tbl.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -no-opaque-pointers -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
@@ -5,1499 +6,1713 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl1_s8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL11_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl1_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL11_I]]
+//
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
   return vtbl1_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl1_s8(<16 x i8> noundef %a, <8 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL1_I]]
+//
 int8x8_t test_vqtbl1_s8(int8x16_t a, uint8x8_t b) {
   return vqtbl1_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl2_s8
+// CHECK-SAME: ([2 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[A_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL13_I]]
+//
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
   return vtbl2_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2_s8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL2_I]]
+//
 int8x8_t test_vqtbl2_s8(int8x16x2_t a, uint8x8_t b) {
   return vqtbl2_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL26_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl3_s8
+// CHECK-SAME: ([3 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[A_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL26_I]]
+//
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
   return vtbl3_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3_s8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL3_I]]
+//
 int8x8_t test_vqtbl3_s8(int8x16x3_t a, uint8x8_t b) {
   return vqtbl3_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl4_s8
+// CHECK-SAME: ([4 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[A_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL28_I]]
+//
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
   return vtbl4_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4_s8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL4_I]]
+//
 int8x8_t test_vqtbl4_s8(int8x16x4_t a, uint8x8_t b) {
   return vqtbl4_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl1q_s8(<16 x i8> noundef %a, <16 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL1_I]]
+//
 int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
   return vqtbl1q_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2q_s8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL2_I]]
+//
 int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
   return vqtbl2q_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3q_s8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL3_I]]
+//
 int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
   return vqtbl3q_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4q_s8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL4_I]]
+//
 int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
   return vqtbl4q_s8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx1_s8(<8 x i8> noundef %a, <8 x i8> noundef %b, <8 x i8> noundef %c) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
-// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
-// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx1_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vtbx1_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx2_s8(<8 x i8> noundef %a, [2 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx2_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX13_I]]
+//
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
   return vtbx2_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx3_s8(<8 x i8> noundef %a, [3 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
-// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
-// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
-// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx3_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
+// CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
   return vtbx3_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx4_s8(<8 x i8> noundef %a, [4 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx4_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX28_I]]
+//
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
   return vtbx4_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx1_s8(<8 x i8> noundef %a, <16 x i8> noundef %b, <8 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX1_I]]
+//
 int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, uint8x8_t c) {
   return vqtbx1_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx2_s8(<8 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX2_I]]
+//
 int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, uint8x8_t c) {
   return vqtbx2_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx3_s8(<8 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX3_I]]
+//
 int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, uint8x8_t c) {
   return vqtbx3_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx4_s8(<8 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4_s8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX4_I]]
+//
 int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, uint8x8_t c) {
   return vqtbx4_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx1q_s8(<16 x i8> noundef %a, <16 x i8> noundef %b, <16 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX1_I]]
+//
 int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, uint8x16_t c) {
   return vqtbx1q_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx2q_s8(<16 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2q_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX2_I]]
+//
 int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
   return vqtbx2q_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx3q_s8(<16 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3q_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX3_I]]
+//
 int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
   return vqtbx3q_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx4q_s8(<16 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4q_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX4_I]]
+//
 int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
   return vqtbx4q_s8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl1_u8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL11_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl1_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL11_I]]
+//
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
   return vtbl1_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl1_u8(<16 x i8> noundef %a, <8 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL1_I]]
+//
 uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
   return vqtbl1_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl2_u8
+// CHECK-SAME: ([2 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[A_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL13_I]]
+//
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
   return vtbl2_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2_u8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL2_I]]
+//
 uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
   return vqtbl2_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL26_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl3_u8
+// CHECK-SAME: ([3 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[A_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL26_I]]
+//
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
   return vtbl3_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3_u8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL3_I]]
+//
 uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
   return vqtbl3_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl4_u8
+// CHECK-SAME: ([4 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[A_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL28_I]]
+//
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
   return vtbl4_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4_u8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL4_I]]
+//
 uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
   return vqtbl4_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl1q_u8(<16 x i8> noundef %a, <16 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL1_I]]
+//
 uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
   return vqtbl1q_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2q_u8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL2_I]]
+//
 uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
   return vqtbl2q_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3q_u8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL3_I]]
+//
 uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
   return vqtbl3q_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4q_u8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL4_I]]
+//
 uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
   return vqtbl4q_u8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx1_u8(<8 x i8> noundef %a, <8 x i8> noundef %b, <8 x i8> noundef %c) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
-// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
-// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx1_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vtbx1_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx2_u8(<8 x i8> noundef %a, [2 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx2_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX13_I]]
+//
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
   return vtbx2_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx3_u8(<8 x i8> noundef %a, [3 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
-// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
-// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
-// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx3_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
+// CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
   return vtbx3_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx4_u8(<8 x i8> noundef %a, [4 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx4_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX28_I]]
+//
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
   return vtbx4_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx1_u8(<8 x i8> noundef %a, <16 x i8> noundef %b, <8 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX1_I]]
+//
 uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
   return vqtbx1_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx2_u8(<8 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX2_I]]
+//
 uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
   return vqtbx2_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx3_u8(<8 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX3_I]]
+//
 uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
   return vqtbx3_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx4_u8(<8 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4_u8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX4_I]]
+//
 uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
   return vqtbx4_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx1q_u8(<16 x i8> noundef %a, <16 x i8> noundef %b, <16 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX1_I]]
+//
 uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vqtbx1q_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx2q_u8(<16 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2q_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX2_I]]
+//
 uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
   return vqtbx2q_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx3q_u8(<16 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3q_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX3_I]]
+//
 uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
   return vqtbx3q_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx4q_u8(<16 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4q_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX4_I]]
+//
 uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
   return vqtbx4q_u8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl1_p8(<8 x i8> noundef %a, <8 x i8> noundef %b) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL11_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl1_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL11_I]]
+//
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
   return vtbl1_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl1_p8(<16 x i8> noundef %a, <8 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL1_I]]
+//
 poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
   return vqtbl1_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl2_p8
+// CHECK-SAME: ([2 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[A_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL13_I]]
+//
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
   return vtbl2_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2_p8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL2_I]]
+//
 poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
   return vqtbl2_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL26_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl3_p8
+// CHECK-SAME: ([3 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[A_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL26_I]]
+//
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
   return vtbl3_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3_p8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL3_I]]
+//
 poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
   return vqtbl3_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> noundef %b) #0 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbl4_p8
+// CHECK-SAME: ([4 x <8 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[A_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL28_I]]
+//
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
   return vtbl4_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #3
-// CHECK:   ret <8 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4_p8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBL4_I]]
+//
 poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
   return vqtbl4_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl1q_p8(<16 x i8> noundef %a, <16 x i8> noundef %b) #1 {
-// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL1_I]]
+//
 poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
   return vqtbl1q_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl2q_p8
+// CHECK-SAME: ([2 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[A_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL2_I]]
+//
 poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
   return vqtbl2q_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl3q_p8
+// CHECK-SAME: ([3 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[A_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL3_I]]
+//
 poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
   return vqtbl3q_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> noundef %b) #1 {
-// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #3
-// CHECK:   ret <16 x i8> [[VTBL4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbl4q_p8
+// CHECK-SAME: ([4 x <16 x i8>] [[A_COERCE:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P0_I:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[A:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[A_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBL4_I]]
+//
 poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
   return vqtbl4q_p8(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx1_p8(<8 x i8> noundef %a, <8 x i8> noundef %b, <8 x i8> noundef %c) #0 {
-// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
-// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
-// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx1_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK-NEXT:    [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
   return vtbx1_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx2_p8(<8 x i8> noundef %a, [2 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX13_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx2_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX13_I]]
+//
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
   return vtbx2_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx3_p8(<8 x i8> noundef %a, [3 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #3
-// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
-// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
-// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
-// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
-// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
-// CHECK:   ret <8 x i8> [[VTBX_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx3_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK-NEXT:    [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
+// CHECK-NEXT:    [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK-NEXT:    [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX_I]]
+//
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
   return vtbx3_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vtbx4_p8(<8 x i8> noundef %a, [4 x <8 x i8>] %b.coerce, <8 x i8> noundef %c) #0 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
-// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX28_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtbx4_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <8 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK-NEXT:    [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX28_I]]
+//
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
   return vtbx4_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx1_p8(<8 x i8> noundef %a, <16 x i8> noundef %b, <8 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX1_I]]
+//
 poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
   return vqtbx1_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx2_p8(<8 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX2_I]]
+//
 poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
   return vqtbx2_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx3_p8(<8 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX3_I]]
+//
 poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
   return vqtbx3_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vqtbx4_p8(<8 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <8 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #3
-// CHECK:   ret <8 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4_p8
+// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <8 x i8> [[VTBX4_I]]
+//
 poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
   return vqtbx4_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx1q_p8(<16 x i8> noundef %a, <16 x i8> noundef %b, <16 x i8> noundef %c) #1 {
-// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX1_I]]
+//
 poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vqtbx1q_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx2q_p8(<16 x i8> noundef %a, [2 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx2q_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [2 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX2_I]]
+//
 poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
   return vqtbx2q_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx3q_p8(<16 x i8> noundef %a, [3 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX3_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx3q_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [3 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX3_I]]
+//
 poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
   return vqtbx3q_p8(a, b, c);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vqtbx4q_p8(<16 x i8> noundef %a, [4 x <16 x i8>] %b.coerce, <16 x i8> noundef %c) #1 {
-// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
-// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
-// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
-// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
-// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
-// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
-// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
-// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #3
-// CHECK:   ret <16 x i8> [[VTBX4_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vqtbx4q_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], [4 x <16 x i8>] [[B_COERCE:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__P1_I:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK-NEXT:    [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK-NEXT:    [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK-NEXT:    [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK-NEXT:    [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK-NEXT:    [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <16 x i8> [[VTBX4_I]]
+//
 poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {
   return vqtbx4q_p8(a, b, c);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"

diff  --git a/clang/test/CodeGen/aarch64-poly128.c b/clang/test/CodeGen/aarch64-poly128.c
index 6d642e4d19d92..2e58c0f286e6f 100644
--- a/clang/test/CodeGen/aarch64-poly128.c
+++ b/clang/test/CodeGen/aarch64-poly128.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -no-opaque-pointers -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
 // RUN:  | FileCheck %s
@@ -13,238 +14,331 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} void @test_vstrq_p128(i128* noundef %ptr, i128 noundef %val) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   store i128 %val, i128* [[TMP1]]
-// CHECK:   ret void
+// CHECK-LABEL: define {{[^@]+}}@test_vstrq_p128
+// CHECK-SAME: (i128* noundef [[PTR:%.*]], i128 noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    store i128 [[VAL]], i128* [[TMP1]], align 16
+// CHECK-NEXT:    ret void
+//
 void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
   vstrq_p128(ptr, val);
 
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vldrq_p128(i128* noundef %ptr) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
-// CHECK:   ret i128 [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vldrq_p128
+// CHECK-SAME: (i128* noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i128, i128* [[TMP1]], align 16
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
 poly128_t test_vldrq_p128(poly128_t * ptr) {
   return vldrq_p128(ptr);
 
 }
 
-// CHECK-LABEL: define{{.*}} void @test_ld_st_p128(i128* noundef %ptr) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
-// CHECK:   [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1
-// CHECK:   [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
-// CHECK:   store i128 [[TMP2]], i128* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: define {{[^@]+}}@test_ld_st_p128
+// CHECK-SAME: (i128* noundef [[PTR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i128, i128* [[TMP1]], align 16
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
+// CHECK-NEXT:    store i128 [[TMP2]], i128* [[TMP4]], align 16
+// CHECK-NEXT:    ret void
+//
 void test_ld_st_p128(poly128_t * ptr) {
    vstrq_p128(ptr+1, vldrq_p128(ptr));
 
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vmull_p64(i64 noundef %a, i64 noundef %b) #0 {
-// CHECK:   [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #3
-// CHECK:   [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
-// CHECK:   ret i128 [[VMULL_P641_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmull_p64
+// CHECK-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A]], i64 [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
+// CHECK-NEXT:    ret i128 [[VMULL_P641_I]]
+//
 poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
   return vmull_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vmull_high_p64(<2 x i64> noundef %a, <2 x i64> noundef %b) #1 {
-// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
-// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
-// CHECK:   [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #3
-// CHECK:   [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
-// CHECK:   ret i128 [[VMULL_P641_I_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmull_high_p64
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I5:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> [[B]], <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to i64
+// CHECK-NEXT:    [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #[[ATTR3]]
+// CHECK-NEXT:    [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
+// CHECK-NEXT:    ret i128 [[VMULL_P641_I_I]]
+//
 poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
   return vmull_high_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
   return vreinterpretq_p128_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
   return vreinterpretq_p128_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
   return vreinterpretq_p128_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s64(<2 x i64> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_s64
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
   return vreinterpretq_p128_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
   return vreinterpretq_p128_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
   return vreinterpretq_p128_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u32(<4 x i32> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u32
+// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
   return vreinterpretq_p128_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u64(<2 x i64> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_u64
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
   return vreinterpretq_p128_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
   return vreinterpretq_p128_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_f64(<2 x double> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_f64
+// CHECK-SAME: (<2 x double> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
   return vreinterpretq_p128_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p8(<16 x i8> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p8
+// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
   return vreinterpretq_p128_p8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p16(<8 x i16> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
   return vreinterpretq_p128_p16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p64(<2 x i64> noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p128_p64
+// CHECK-SAME: (<2 x i64> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
   return vreinterpretq_p128_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_s8_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s8_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
   return vreinterpretq_s8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_s16_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s16_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p128(poly128_t  a) {
   return vreinterpretq_s16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vreinterpretq_s32_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
-// CHECK:   ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s32_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
   return vreinterpretq_s32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_s64_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_s64_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_p128(poly128_t  a) {
   return vreinterpretq_s64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_u8_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u8_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p128(poly128_t  a) {
   return vreinterpretq_u8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_u16_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u16_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p128(poly128_t  a) {
   return vreinterpretq_u16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vreinterpretq_u32_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
-// CHECK:   ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u32_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p128(poly128_t  a) {
   return vreinterpretq_u32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_u64_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_u64_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p128(poly128_t  a) {
   return vreinterpretq_u64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vreinterpretq_f32_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x float>
-// CHECK:   ret <4 x float> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f32_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p128(poly128_t  a) {
   return vreinterpretq_f32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vreinterpretq_f64_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x double>
-// CHECK:   ret <2 x double> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_f64_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x double>
+// CHECK-NEXT:    ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_p128(poly128_t  a) {
   return vreinterpretq_f64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_p8_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p8_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_p128(poly128_t  a) {
   return vreinterpretq_p8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_p16_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p16_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_p128(poly128_t  a) {
   return vreinterpretq_p16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_p64_p128(i128 noundef %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: define {{[^@]+}}@test_vreinterpretq_p64_p128
+// CHECK-SAME: (i128 noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_p128(poly128_t  a) {
   return vreinterpretq_p64_p128(a);
 }

diff  --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index 8bfb532442dd6..3b378527f9cef 100644
--- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -no-opaque-pointers -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.2a\
 // RUN: -fallow-half-arguments-and-returns -flax-vector-conversions=none -S -disable-O0-optnone -emit-llvm -o - %s \
 // RUN: | opt -S -mem2reg \
@@ -7,1643 +8,2463 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vabs_f16
-// CHECK:  [[ABS:%.*]] =  call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vabs_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT:    ret <4 x half> [[VABS1_I]]
+//
 float16x4_t test_vabs_f16(float16x4_t a) {
   return vabs_f16(a);
 }
 
-// CHECK-LABEL: test_vabsq_f16
-// CHECK:  [[ABS:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vabsq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VABS1_I]]
+//
 float16x8_t test_vabsq_f16(float16x8_t a) {
   return vabsq_f16(a);
 }
 
-// CHECK-LABEL: test_vceqz_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oeq <4 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vceqz_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp oeq <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[VCEQZ_I]]
+//
 uint16x4_t test_vceqz_f16(float16x4_t a) {
   return vceqz_f16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oeq <8 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vceqzq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp oeq <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[VCEQZ_I]]
+//
 uint16x8_t test_vceqzq_f16(float16x8_t a) {
   return vceqzq_f16(a);
 }
 
-// CHECK-LABEL: test_vcgez_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oge <4 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgez_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp oge <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[VCGEZ_I]]
+//
 uint16x4_t test_vcgez_f16(float16x4_t a) {
   return vcgez_f16(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oge <8 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgezq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp oge <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[VCGEZ_I]]
+//
 uint16x8_t test_vcgezq_f16(float16x8_t a) {
   return vcgezq_f16(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ogt <4 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgtz_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[VCGTZ_I]]
+//
 uint16x4_t test_vcgtz_f16(float16x4_t a) {
   return vcgtz_f16(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ogt <8 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgtzq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp ogt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[VCGTZ_I]]
+//
 uint16x8_t test_vcgtzq_f16(float16x8_t a) {
   return vcgtzq_f16(a);
 }
 
-// CHECK-LABEL: test_vclez_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ole <4 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vclez_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp ole <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[VCLEZ_I]]
+//
 uint16x4_t test_vclez_f16(float16x4_t a) {
   return vclez_f16(a);
 }
 
-// CHECK-LABEL: test_vclezq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ole <8 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vclezq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp ole <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[VCLEZ_I]]
+//
 uint16x8_t test_vclezq_f16(float16x8_t a) {
   return vclezq_f16(a);
 }
 
-// CHECK-LABEL: test_vcltz_f16
-// CHECK:  [[TMP1:%.*]] = fcmp olt <4 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcltz_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp olt <4 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[VCLTZ_I]]
+//
 uint16x4_t test_vcltz_f16(float16x4_t a) {
   return vcltz_f16(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp olt <8 x half> %a, zeroinitializer
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcltzq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = fcmp olt <8 x half> [[A]], zeroinitializer
+// CHECK-NEXT:    [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[VCLTZ_I]]
+//
 uint16x8_t test_vcltzq_f16(float16x8_t a) {
   return vcltzq_f16(a);
 }
 
-// CHECK-LABEL: test_vcvt_f16_s16
-// CHECK:  [[VCVT:%.*]] = sitofp <4 x i16> %a to <4 x half>
-// CHECK:  ret <4 x half> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_I:%.*]] = sitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT:    ret <4 x half> [[VCVT_I]]
+//
 float16x4_t test_vcvt_f16_s16 (int16x4_t a) {
   return vcvt_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f16_s16
-// CHECK:  [[VCVT:%.*]] = sitofp <8 x i16> %a to <8 x half>
-// CHECK:  ret <8 x half> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_I:%.*]] = sitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[VCVT_I]]
+//
 float16x8_t test_vcvtq_f16_s16 (int16x8_t a) {
   return vcvtq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vcvt_f16_u16
-// CHECK:  [[VCVT:%.*]] = uitofp <4 x i16> %a to <4 x half>
-// CHECK:  ret <4 x half> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_f16_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_I:%.*]] = uitofp <4 x i16> [[A]] to <4 x half>
+// CHECK-NEXT:    ret <4 x half> [[VCVT_I]]
+//
 float16x4_t test_vcvt_f16_u16 (uint16x4_t a) {
   return vcvt_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f16_u16
-// CHECK:  [[VCVT:%.*]] = uitofp <8 x i16> %a to <8 x half>
-// CHECK:  ret <8 x half> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_f16_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_I:%.*]] = uitofp <8 x i16> [[A]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[VCVT_I]]
+//
 float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
   return vcvtq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vcvt_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTZ1_I]]
+//
 int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
   return vcvt_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtq_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTZ1_I]]
+//
 int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
   return vcvtq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvt_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTZ1_I]]
+//
 uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
   return vcvt_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtq_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_u16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTZ1_I]]
+//
 uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
   return vcvtq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvta_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvta_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTA1_I]]
+//
 int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
   return vcvta_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvta_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvta_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTA1_I]]
+//
 uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
   return vcvta_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtaq_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTA1_I]]
+//
 int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
   return vcvtaq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtm_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTM1_I]]
+//
 int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
   return vcvtm_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtmq_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTM1_I]]
+//
 int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
   return vcvtmq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtm_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTM1_I]]
+//
 uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
   return vcvtm_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtmq_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTM1_I]]
+//
 uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
   return vcvtmq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtn_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTN1_I]]
+//
 int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
   return vcvtn_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtnq_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTN1_I]]
+//
 int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
   return vcvtnq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtn_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTN1_I]]
+//
 uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
   return vcvtn_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtnq_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTN1_I]]
+//
 uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
   return vcvtnq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtp_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTP1_I]]
+//
 int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
   return vcvtp_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtpq_s16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTP1_I]]
+//
 int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
   return vcvtpq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtp_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCVTP1_I]]
+//
 uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
   return vcvtp_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vcvtpq_u16_f16
-// CHECK:  [[VCVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x i16> [[VCVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCVTP1_I]]
+//
 uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
   return vcvtpq_u16_f16(a);
 }
 
 // FIXME: Fix the zero constant when fp16 non-storage-only type becomes available.
-// CHECK-LABEL: test_vneg_f16
-// CHECK:  [[NEG:%.*]] = fneg <4 x half> %a
-// CHECK:  ret <4 x half> [[NEG]]
+// CHECK-LABEL: define {{[^@]+}}@test_vneg_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x half> [[A]]
+// CHECK-NEXT:    ret <4 x half> [[FNEG_I]]
+//
 float16x4_t test_vneg_f16(float16x4_t a) {
   return vneg_f16(a);
 }
 
-// CHECK-LABEL: test_vnegq_f16
-// CHECK:  [[NEG:%.*]] = fneg <8 x half> %a
-// CHECK:  ret <8 x half> [[NEG]]
+// CHECK-LABEL: define {{[^@]+}}@test_vnegq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[A]]
+// CHECK-NEXT:    ret <8 x half> [[FNEG_I]]
+//
 float16x8_t test_vnegq_f16(float16x8_t a) {
   return vnegq_f16(a);
 }
 
-// CHECK-LABEL: test_vrecpe_f16
-// CHECK:  [[RCP:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RCP]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrecpe_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRECPE_V1_I]]
+//
 float16x4_t test_vrecpe_f16(float16x4_t a) {
   return vrecpe_f16(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_f16
-// CHECK:  [[RCP:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RCP]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrecpeq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRECPEQ_V1_I]]
+//
 float16x8_t test_vrecpeq_f16(float16x8_t a) {
   return vrecpeq_f16(a);
 }
 
-// CHECK-LABEL: test_vrnd_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.trunc.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrnd_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDZ1_I]]
+//
 float16x4_t test_vrnd_f16(float16x4_t a) {
   return vrnd_f16(a);
 }
 
-// CHECK-LABEL: test_vrndq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.trunc.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDZ1_I]]
+//
 float16x8_t test_vrndq_f16(float16x8_t a) {
   return vrndq_f16(a);
 }
 
-// CHECK-LABEL: test_vrnda_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.round.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrnda_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDA1_I]]
+//
 float16x4_t test_vrnda_f16(float16x4_t a) {
   return vrnda_f16(a);
 }
 
-// CHECK-LABEL: test_vrndaq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.round.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndaq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDA1_I]]
+//
 float16x8_t test_vrndaq_f16(float16x8_t a) {
   return vrndaq_f16(a);
 }
 
-// CHECK-LABEL: test_vrndi_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndi_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDI_V1_I]]
+//
 float16x4_t test_vrndi_f16(float16x4_t a) {
   return vrndi_f16(a);
 }
 
-// CHECK-LABEL: test_vrndiq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndiq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDIQ_V1_I]]
+//
 float16x8_t test_vrndiq_f16(float16x8_t a) {
   return vrndiq_f16(a);
 }
 
-// CHECK-LABEL: test_vrndm_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.floor.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndm_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDM1_I]]
+//
 float16x4_t test_vrndm_f16(float16x4_t a) {
   return vrndm_f16(a);
 }
 
-// CHECK-LABEL: test_vrndmq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.floor.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndmq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDM1_I]]
+//
 float16x8_t test_vrndmq_f16(float16x8_t a) {
   return vrndmq_f16(a);
 }
 
-// CHECK-LABEL: test_vrndn_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.roundeven.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndn_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDN1_I]]
+//
 float16x4_t test_vrndn_f16(float16x4_t a) {
   return vrndn_f16(a);
 }
 
-// CHECK-LABEL: test_vrndnq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.roundeven.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndnq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDN1_I]]
+//
 float16x8_t test_vrndnq_f16(float16x8_t a) {
   return vrndnq_f16(a);
 }
 
-// CHECK-LABEL: test_vrndp_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.ceil.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndp_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDP1_I]]
+//
 float16x4_t test_vrndp_f16(float16x4_t a) {
   return vrndp_f16(a);
 }
 
-// CHECK-LABEL: test_vrndpq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.ceil.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndpq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDP1_I]]
+//
 float16x8_t test_vrndpq_f16(float16x8_t a) {
   return vrndpq_f16(a);
 }
 
-// CHECK-LABEL: test_vrndx_f16
-// CHECK:  [[RND:%.*]] =  call <4 x half> @llvm.rint.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndx_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRNDX1_I]]
+//
 float16x4_t test_vrndx_f16(float16x4_t a) {
   return vrndx_f16(a);
 }
 
-// CHECK-LABEL: test_vrndxq_f16
-// CHECK:  [[RND:%.*]] =  call <8 x half> @llvm.rint.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrndxq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRNDX1_I]]
+//
 float16x8_t test_vrndxq_f16(float16x8_t a) {
   return vrndxq_f16(a);
 }
 
-// CHECK-LABEL: test_vrsqrte_f16
-// CHECK:  [[RND:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrsqrte_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VRSQRTE_V1_I]]
+//
 float16x4_t test_vrsqrte_f16(float16x4_t a) {
   return vrsqrte_f16(a);
 }
 
-// CHECK-LABEL: test_vrsqrteq_f16
-// CHECK:  [[RND:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[RND]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrsqrteq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VRSQRTEQ_V1_I]]
+//
 float16x8_t test_vrsqrteq_f16(float16x8_t a) {
   return vrsqrteq_f16(a);
 }
 
-// CHECK-LABEL: test_vsqrt_f16
-// CHECK:  [[SQR:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
-// CHECK:  ret <4 x half> [[SQR]]
+// CHECK-LABEL: define {{[^@]+}}@test_vsqrt_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VSQRT_I]]
+//
 float16x4_t test_vsqrt_f16(float16x4_t a) {
   return vsqrt_f16(a);
 }
 
-// CHECK-LABEL: test_vsqrtq_f16
-// CHECK:  [[SQR:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
-// CHECK:  ret <8 x half> [[SQR]]
+// CHECK-LABEL: define {{[^@]+}}@test_vsqrtq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VSQRT_I]]
+//
 float16x8_t test_vsqrtq_f16(float16x8_t a) {
   return vsqrtq_f16(a);
 }
 
-// CHECK-LABEL: test_vadd_f16
-// CHECK:  [[ADD:%.*]] = fadd <4 x half> %a, %b
-// CHECK:  ret <4 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vadd_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <4 x half> [[ADD_I]]
+//
 float16x4_t test_vadd_f16(float16x4_t a, float16x4_t b) {
   return vadd_f16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_f16
-// CHECK:  [[ADD:%.*]] = fadd <8 x half> %a, %b
-// CHECK:  ret <8 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vaddq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[ADD_I:%.*]] = fadd <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <8 x half> [[ADD_I]]
+//
 float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
   return vaddq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_f16
-// CHECK:  [[ABD:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[ABD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vabd_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VABD2_I]]
+//
 float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
   return vabd_f16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_f16
-// CHECK:  [[ABD:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[ABD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vabdq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VABD2_I]]
+//
 float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
   return vabdq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcage_f16
-// CHECK:  [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcage_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCAGE_V2_I]]
+//
 uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
   return vcage_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcageq_f16
-// CHECK:  [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcageq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCAGEQ_V2_I]]
+//
 uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
   return vcageq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcagt_f16
-// CHECK:  [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcagt_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCAGT_V2_I]]
+//
 uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
   return vcagt_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcagtq_f16
-// CHECK:  [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcagtq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCAGTQ_V2_I]]
+//
 uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
   return vcagtq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcale_f16
-// CHECK:  [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
-// CHECK:  ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcale_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCALE_V2_I]]
+//
 uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
   return vcale_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcaleq_f16
-// CHECK:  [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
-// CHECK:  ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcaleq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCALEQ_V2_I]]
+//
 uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
   return vcaleq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcalt_f16
-// CHECK:  [[ABS:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> %b, <4 x half> %a)
-// CHECK:  ret <4 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcalt_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x i16> [[VCALT_V2_I]]
+//
 uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
   return vcalt_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcaltq_f16
-// CHECK:  [[ABS:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> %b, <8 x half> %a)
-// CHECK:  ret <8 x i16> [[ABS]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcaltq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x i16> [[VCALTQ_V2_I]]
+//
 uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) {
   return vcaltq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oeq <4 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vceq_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oeq <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vceq_f16(float16x4_t a, float16x4_t b) {
   return vceq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oeq <8 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vceqq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oeq <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vceqq_f16(float16x8_t a, float16x8_t b) {
   return vceqq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oge <4 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcge_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oge <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vcge_f16(float16x4_t a, float16x4_t b) {
   return vcge_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp oge <8 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgeq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oge <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcgeq_f16(float16x8_t a, float16x8_t b) {
   return vcgeq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ogt <4 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgt_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ogt <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vcgt_f16(float16x4_t a, float16x4_t b) {
   return vcgt_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ogt <8 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcgtq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ogt <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcgtq_f16(float16x8_t a, float16x8_t b) {
   return vcgtq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ole <4 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcle_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ole <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vcle_f16(float16x4_t a, float16x4_t b) {
   return vcle_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp ole <8 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcleq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ole <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcleq_f16(float16x8_t a, float16x8_t b) {
   return vcleq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_f16
-// CHECK:  [[TMP1:%.*]] = fcmp olt <4 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <4 x i1> [[TMP1]] to <4 x i16>
-// CHECK:  ret <4 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vclt_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK-NEXT:    ret <4 x i16> [[SEXT_I]]
+//
 uint16x4_t test_vclt_f16(float16x4_t a, float16x4_t b) {
   return vclt_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_f16
-// CHECK:  [[TMP1:%.*]] = fcmp olt <8 x half> %a, %b
-// CHECK:  [[TMP2:%.*]] = sext <8 x i1> [[TMP1:%.*]] to <8 x i16>
-// CHECK:  ret <8 x i16> [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcltq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CMP_I:%.*]] = fcmp olt <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[SEXT_I]]
+//
 uint16x8_t test_vcltq_f16(float16x8_t a, float16x8_t b) {
   return vcltq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcvt_n_f16_s16
-// CHECK:  [[CVT:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
-// CHECK:  ret <4 x half> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_s16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxs2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <4 x half> [[VCVT_N1]]
+//
 float16x4_t test_vcvt_n_f16_s16(int16x4_t a) {
   return vcvt_n_f16_s16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f16_s16
-// CHECK:  [[CVT:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
-// CHECK:  ret <8 x half> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_s16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxs2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <8 x half> [[VCVT_N1]]
+//
 float16x8_t test_vcvtq_n_f16_s16(int16x8_t a) {
   return vcvtq_n_f16_s16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvt_n_f16_u16
-// CHECK:  [[CVT:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> %vcvt_n, i32 2)
-// CHECK:  ret <4 x half> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_f16_u16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <4 x half> @llvm.aarch64.neon.vcvtfxu2fp.v4f16.v4i16(<4 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <4 x half> [[VCVT_N1]]
+//
 float16x4_t test_vcvt_n_f16_u16(uint16x4_t a) {
   return vcvt_n_f16_u16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f16_u16
-// CHECK:  [[CVT:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> %vcvt_n, i32 2)
-// CHECK:  ret <8 x half> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_f16_u16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <8 x half> @llvm.aarch64.neon.vcvtfxu2fp.v8f16.v8i16(<8 x i16> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <8 x half> [[VCVT_N1]]
+//
 float16x8_t test_vcvtq_n_f16_u16(uint16x8_t a) {
   return vcvtq_n_f16_u16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvt_n_s16_f16
-// CHECK:  [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
-// CHECK:  ret <4 x i16> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_s16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <4 x i16> [[VCVT_N1]]
+//
 int16x4_t test_vcvt_n_s16_f16(float16x4_t a) {
   return vcvt_n_s16_f16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvtq_n_s16_f16
-// CHECK:  [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
-// CHECK:  ret <8 x i16> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_s16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxs.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <8 x i16> [[VCVT_N1]]
+//
 int16x8_t test_vcvtq_n_s16_f16(float16x8_t a) {
   return vcvtq_n_s16_f16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvt_n_u16_f16
-// CHECK:  [[CVT:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> %vcvt_n, i32 2)
-// CHECK:  ret <4 x i16> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvt_n_u16_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v4i16.v4f16(<4 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <4 x i16> [[VCVT_N1]]
+//
 uint16x4_t test_vcvt_n_u16_f16(float16x4_t a) {
   return vcvt_n_u16_f16(a, 2);
 }
 
-// CHECK-LABEL: test_vcvtq_n_u16_f16
-// CHECK:  [[CVT:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> %vcvt_n, i32 2)
-// CHECK:  ret <8 x i16> [[CVT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtq_n_u16_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VCVT_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.vcvtfp2fxu.v8i16.v8f16(<8 x half> [[VCVT_N]], i32 2)
+// CHECK-NEXT:    ret <8 x i16> [[VCVT_N1]]
+//
 uint16x8_t test_vcvtq_n_u16_f16(float16x8_t a) {
   return vcvtq_n_u16_f16(a, 2);
 }
 
-// CHECK-LABEL: test_vdiv_f16
-// CHECK:  [[DIV:%.*]] = fdiv <4 x half> %a, %b
-// CHECK:  ret <4 x half> [[DIV]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdiv_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <4 x half> [[DIV_I]]
+//
 float16x4_t test_vdiv_f16(float16x4_t a, float16x4_t b) {
   return vdiv_f16(a, b);
 }
 
-// CHECK-LABEL: test_vdivq_f16
-// CHECK:  [[DIV:%.*]] = fdiv <8 x half> %a, %b
-// CHECK:  ret <8 x half> [[DIV]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdivq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DIV_I:%.*]] = fdiv <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <8 x half> [[DIV_I]]
+//
 float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) {
   return vdivq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_f16
-// CHECK:  [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmax_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMAX2_I]]
+//
 float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
   return vmax_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_f16
-// CHECK:  [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMAX2_I]]
+//
 float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
   return vmaxq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxnm_f16
-// CHECK:  [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMAXNM2_I]]
+//
 float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
   return vmaxnm_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxnmq_f16
-// CHECK:  [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMAXNM2_I]]
+//
 float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
   return vmaxnmq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmin_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMIN2_I]]
+//
 float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
   return vmin_f16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_f16
-// CHECK:  [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMIN2_I]]
+//
 float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
   return vminq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vminnm_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnm_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMINNM2_I]]
+//
 float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
   return vminnm_f16(a, b);
 }
 
-// CHECK-LABEL: test_vminnmq_f16
-// CHECK:  [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMINNM2_I]]
+//
 float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) {
   return vminnmq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_f16
-// CHECK:  [[MUL:%.*]] = fmul <4 x half> %a, %b
-// CHECK:  ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmul_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <4 x half> [[MUL_I]]
+//
 float16x4_t test_vmul_f16(float16x4_t a, float16x4_t b) {
   return vmul_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_f16
-// CHECK:  [[MUL:%.*]] = fmul <8 x half> %a, %b
-// CHECK:  ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[MUL_I:%.*]] = fmul <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <8 x half> [[MUL_I]]
+//
 float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
   return vmulq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulx_f16
-// CHECK:  [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulx_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMULX2_I]]
+//
 float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
   return vmulx_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulxq_f16
-// CHECK:  [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMULX2_I]]
+//
 float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
   return vmulxq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_f16
-// CHECK:  [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpadd_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <4 x half> [[VPADD_V2_I]]
+//
 float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
   return vpadd_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpaddq_f16
-// CHECK:  [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpaddq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <8 x half> [[VPADDQ_V2_I]]
+//
 float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
   return vpaddq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_f16
-// CHECK:  [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpmax_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VPMAX2_I]]
+//
 float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
   return vpmax_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpmaxq_f16
-// CHECK:  [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpmaxq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VPMAX2_I]]
+//
 float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
   return vpmaxq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpmaxnm_f16
-// CHECK:  [[MAX:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnm_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VPMAXNM2_I]]
+//
 float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
   return vpmaxnm_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpmaxnmq_f16
-// CHECK:  [[MAX:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpmaxnmq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VPMAXNM2_I]]
+//
 float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
   return vpmaxnmq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpmin_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VPMIN2_I]]
+//
 float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
   return vpmin_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpminq_f16
-// CHECK:  [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpminq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VPMIN2_I]]
+//
 float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
   return vpminq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpminnm_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpminnm_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VPMINNM2_I]]
+//
 float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
   return vpminnm_f16(a, b);
 }
 
-// CHECK-LABEL: test_vpminnmq_f16
-// CHECK:  [[MIN:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vpminnmq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VPMINNM2_I]]
+//
 float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
   return vpminnmq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vrecps_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrecps_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <4 x half> [[VRECPS_V2_I]]
+//
 float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
   return vrecps_f16(a, b);
 }
 
-// CHECK-LABEL: test_vrecpsq_f16
-// CHECK:  [[MIN:%.*]] =  call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrecpsq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <8 x half> [[VRECPSQ_V2_I]]
+//
 float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
   return vrecpsq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vrsqrts_f16
-// CHECK:  [[MIN:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %a, <4 x half> %b)
-// CHECK:  ret <4 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrsqrts_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <4 x half> [[VRSQRTS_V2_I]]
+//
 float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
   return vrsqrts_f16(a, b);
 }
 
-// CHECK-LABEL: test_vrsqrtsq_f16
-// CHECK:  [[MIN:%.*]] =  call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %a, <8 x half> %b)
-// CHECK:  ret <8 x half> [[MIN]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrsqrtsq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
+// CHECK-NEXT:    [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <8 x half> [[VRSQRTSQ_V2_I]]
+//
 float16x8_t test_vrsqrtsq_f16(float16x8_t a, float16x8_t b) {
   return vrsqrtsq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vsub_f16
-// CHECK:  [[ADD:%.*]] = fsub <4 x half> %a, %b 
-// CHECK:  ret <4 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vsub_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub <4 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <4 x half> [[SUB_I]]
+//
 float16x4_t test_vsub_f16(float16x4_t a, float16x4_t b) {
   return vsub_f16(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_f16
-// CHECK:  [[ADD:%.*]] = fsub <8 x half> %a, %b
-// CHECK:  ret <8 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vsubq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SUB_I:%.*]] = fsub <8 x half> [[A]], [[B]]
+// CHECK-NEXT:    ret <8 x half> [[SUB_I]]
+//
 float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
   return vsubq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vfma_f16
-// CHECK:  [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %a)
-// CHECK:  ret <4 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfma_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[TMP3]]
+//
 float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfma_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmaq_f16
-// CHECK:  [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %a)
-// CHECK:  ret <8 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmaq_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfms_f16
-// CHECK:  [[SUB:%.*]] = fneg <4 x half> %b
-// CHECK:  [[ADD:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> %c, <4 x half> %a)
-// CHECK:  ret <4 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfms_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[TMP3]]
+//
 float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfms_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmsq_f16
-// CHECK:  [[SUB:%.*]] = fneg <8 x half> %b
-// CHECK:  [[ADD:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> %c, <8 x half> %a)
-// CHECK:  ret <8 x half> [[ADD]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmsq_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfma_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CHECK: ret <4 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfma_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
+//
 float16x4_t test_vfma_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfma_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfmaq_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CHECK: ret <8 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_lane_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT:    ret <8 x half> [[FMLA2]]
+//
 float16x8_t test_vfmaq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmaq_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfma_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK: ret <4 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfma_laneq_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
 float16x4_t test_vfma_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
   return vfma_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vfmaq_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK: ret <8 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmaq_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vfma_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
-// CHECK: [[FMA:%.*]]  = call <4 x half> @llvm.fma.v4f16(<4 x half> %b, <4 x half> [[TMP3]], <4 x half> %a)
-// CHECK: ret <4 x half> [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[TMP3]]
+//
 float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
   return vfma_n_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmaq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
-// CHECK: [[FMA:%.*]]  = call <8 x half> @llvm.fma.v8f16(<8 x half> %b, <8 x half> [[TMP7]], <8 x half> %a)
-// CHECK: ret <8 x half> [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
   return vfmaq_n_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmah_lane_f16
-// CHECK: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
-// CHECK: [[FMA:%.*]]  = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
-// CHECK: ret half [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmah_lane_f16
+// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP0]]
+//
 float16_t test_vfmah_lane_f16(float16_t a, float16_t b, float16x4_t c) {
   return vfmah_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfmah_laneq_f16
-// CHECK: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
-// CHECK: [[FMA:%.*]]  = call half @llvm.fma.f16(half %b, half [[EXTR]], half %a)
-// CHECK: ret half [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmah_laneq_f16
+// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = call half @llvm.fma.f16(half [[B]], half [[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP0]]
+//
 float16_t test_vfmah_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
   return vfmah_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vfms_lane_f16
-// CHECK: [[SUB:%.*]]  = fneg <4 x half> %b
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[FMA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[TMP4]], <4 x half> [[LANE]], <4 x half> [[TMP5]])
-// CHECK: ret <4 x half> [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfms_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FMLA]], <4 x half> [[LANE]], <4 x half> [[FMLA1]])
+// CHECK-NEXT:    ret <4 x half> [[FMLA2]]
+//
 float16x4_t test_vfms_lane_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
   return vfms_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfmsq_lane_f16
-// CHECK: [[SUB:%.*]]  = fneg <8 x half> %b
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[TMP4]], <8 x half> [[LANE]], <8 x half> [[TMP5]])
-// CHECK: ret <8 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsq_lane_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FMLA]], <8 x half> [[LANE]], <8 x half> [[FMLA1]])
+// CHECK-NEXT:    ret <8 x half> [[FMLA2]]
+//
 float16x8_t test_vfmsq_lane_f16(float16x8_t a, float16x8_t b, float16x4_t c) {
   return vfmsq_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfms_laneq_f16
-// CHECK: [[SUB:%.*]]  = fneg <4 x half> %b
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <4 x half> [[SUB]] to <8 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[FMLA:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
-// CHECK: ret <4 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfms_laneq_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[LANE]], <4 x half> [[TMP4]], <4 x half> [[TMP3]])
+// CHECK-NEXT:    ret <4 x half> [[TMP6]]
+//
 float16x4_t test_vfms_laneq_f16(float16x4_t a, float16x4_t b, float16x8_t c) {
   return vfms_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vfmsq_laneq_f16
-// CHECK: [[SUB:%.*]]  = fneg <8 x half> %b
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x half> [[SUB]] to <16 x i8>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[FMLA:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
-// CHECK: ret <8 x half> [[FMLA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsq_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP5]], <8 x half> [[TMP5]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP6:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[LANE]], <8 x half> [[TMP4]], <8 x half> [[TMP3]])
+// CHECK-NEXT:    ret <8 x half> [[TMP6]]
+//
 float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
   return vfmsq_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vfms_n_f16
-// CHECK: [[SUB:%.*]]  = fneg <4 x half> %b
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %c, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %c, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %c, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %c, i32 3
-// CHECK: [[FMA:%.*]]  = call <4 x half> @llvm.fma.v4f16(<4 x half> [[SUB]], <4 x half> [[TMP3]], <4 x half> %a)
-// CHECK: ret <4 x half> [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfms_n_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <4 x half> [[B]]
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[TMP3]]
+//
 float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
   return vfms_n_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmsq_n_f16
-// CHECK: [[SUB:%.*]]  = fneg <8 x half> %b
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %c, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %c, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %c, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %c, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %c, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %c, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %c, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %c, i32 7
-// CHECK: [[FMA:%.*]]  = call <8 x half> @llvm.fma.v8f16(<8 x half> [[SUB]], <8 x half> [[TMP7]], <8 x half> %a)
-// CHECK: ret <8 x half> [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsq_n_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <8 x half> [[B]]
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[C]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[C]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[C]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[C]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[C]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[C]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[C]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[C]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[TMP3]]
+//
 float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
   return vfmsq_n_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmsh_lane_f16
-// CHECK: [[TMP0:%.*]] = fpext half %b to float
-// CHECK: [[TMP1:%.*]] = fneg float [[TMP0]]
-// CHECK: [[SUB:%.*]]  = fptrunc float [[TMP1]] to half
-// CHECK: [[EXTR:%.*]] = extractelement <4 x half> %c, i32 3
-// CHECK: [[FMA:%.*]]  = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
-// CHECK: ret half [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsh_lane_f16
+// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[B]] to float
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[CONV]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[C]], i32 3
+// CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP1]]
+//
 float16_t test_vfmsh_lane_f16(float16_t a, float16_t b, float16x4_t c) {
   return vfmsh_lane_f16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vfmsh_laneq_f16
-// CHECK: [[TMP0:%.*]] = fpext half %b to float
-// CHECK: [[TMP1:%.*]] = fneg float [[TMP0]]
-// CHECK: [[SUB:%.*]]  = fptrunc float [[TMP1]] to half
-// CHECK: [[EXTR:%.*]] = extractelement <8 x half> %c, i32 7
-// CHECK: [[FMA:%.*]]  = call half @llvm.fma.f16(half [[SUB]], half [[EXTR]], half %a)
-// CHECK: ret half [[FMA]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmsh_laneq_f16
+// CHECK-SAME: (half noundef [[A:%.*]], half noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[B]] to float
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[CONV]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fptrunc float [[FNEG]] to half
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[C]], i32 7
+// CHECK-NEXT:    [[TMP1:%.*]] = call half @llvm.fma.f16(half [[TMP0]], half [[EXTRACT]], half [[A]])
+// CHECK-NEXT:    ret half [[TMP1]]
+//
 float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
   return vfmsh_laneq_f16(a, b, c, 7);
 }
 
-// CHECK-LABEL: test_vmul_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmul_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]]
+// CHECK-NEXT:    ret <4 x half> [[MUL]]
+//
 float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
   return vmul_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulq_lane_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]]
+// CHECK-NEXT:    ret <8 x half> [[MUL]]
+//
 float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
   return vmulq_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmul_laneq_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x half> [[A]], [[LANE]]
+// CHECK-NEXT:    ret <4 x half> [[MUL]]
+//
 float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) {
   return vmul_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmulq_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulq_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <8 x half> [[A]], [[LANE]]
+// CHECK-NEXT:    ret <8 x half> [[MUL]]
+//
 float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) {
   return vmulq_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmul_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[MUL:%.*]]  = fmul <4 x half> %a, [[TMP3]]
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmul_n_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <4 x half> [[A]], [[VECINIT3]]
+// CHECK-NEXT:    ret <4 x half> [[MUL]]
+//
 float16x4_t test_vmul_n_f16(float16x4_t a, float16_t b) {
   return vmul_n_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %b, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %b, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %b, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %b, i32 7
-// CHECK: [[MUL:%.*]]  = fmul <8 x half> %a, [[TMP7]]
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulq_n_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
+// CHECK-NEXT:    [[MUL:%.*]] = fmul <8 x half> [[A]], [[VECINIT7]]
+// CHECK-NEXT:    ret <8 x half> [[MUL]]
+//
 float16x8_t test_vmulq_n_f16(float16x8_t a, float16_t b) {
   return vmulq_n_f16(a, b);
 }
 
 // FIXME: Fix it when fp16 non-storage-only type becomes available.
-// CHECK-LABEL: test_vmulh_lane_f16
-// CHECK: [[CONV0:%.*]] = fpext half %a to float
-// CHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulh_lane_f16
+// CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__REINT_851:%.*]] = alloca <4 x half>, align 8
+// CHECK-NEXT:    [[__REINT1_851:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[A]] to float
+// CHECK-NEXT:    store <4 x half> [[B]], <4 x half>* [[__REINT_851]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_851]] to <4 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK-NEXT:    store i16 [[VGET_LANE]], i16* [[__REINT1_851]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_851]] to half*
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
+// CHECK-NEXT:    [[CONV2:%.*]] = fpext half [[TMP3]] to float
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = fptrunc float [[MUL]] to half
+// CHECK-NEXT:    ret half [[TMP4]]
+//
 float16_t test_vmulh_lane_f16(float16_t a, float16x4_t b) {
   return vmulh_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulh_laneq_f16
-// CHECK: [[CONV0:%.*]] = fpext half %a to float
-// CHECK: [[CONV1:%.*]] = fpext half %{{.*}} to float
-// CHECK: [[MUL:%.*]]   = fmul float [[CONV0:%.*]], [[CONV0:%.*]]
-// CHECK: [[CONV3:%.*]] = fptrunc float %mul to half
-// CHECK: ret half [[CONV3:%.*]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulh_laneq_f16
+// CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[__REINT_854:%.*]] = alloca <8 x half>, align 16
+// CHECK-NEXT:    [[__REINT1_854:%.*]] = alloca i16, align 2
+// CHECK-NEXT:    [[CONV:%.*]] = fpext half [[A]] to float
+// CHECK-NEXT:    store <8 x half> [[B]], <8 x half>* [[__REINT_854]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_854]] to <8 x i16>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK-NEXT:    store i16 [[VGETQ_LANE]], i16* [[__REINT1_854]], align 2
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[__REINT1_854]] to half*
+// CHECK-NEXT:    [[TMP3:%.*]] = load half, half* [[TMP2]], align 2
+// CHECK-NEXT:    [[CONV2:%.*]] = fpext half [[TMP3]] to float
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[CONV]], [[CONV2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = fptrunc float [[MUL]] to half
+// CHECK-NEXT:    ret half [[TMP4]]
+//
 float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulh_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmulx_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
-// CHECK: ret <4 x half> [[VMULX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulx_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMULX2_I]]
+//
 float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
   return vmulx_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulxq_lane_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
-// CHECK: ret <8 x half> [[VMULX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_lane_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMULX2_I]]
+//
 float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
   return vmulxq_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulx_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
-// CHECK: ret <4 x half> [[VMULX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulx_laneq_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMULX2_I]]
+//
 float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
   return vmulx_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmulxq_laneq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
-// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
-// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
-// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
-// CHECK: ret <8 x half> [[VMULX2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMULX2_I]]
+//
 float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
   return vmulxq_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmulx_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <4 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[MUL:%.*]]  = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP3]])
-// CHECK: ret <4 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulx_n_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[B]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x half> [[VMULX2_I]]
+//
 float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
   return vmulx_n_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulxq_n_f16
-// CHECK: [[TMP0:%.*]] = insertelement <8 x half> undef, half %b, i32 0
-// CHECK: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %b, i32 1
-// CHECK: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %b, i32 2
-// CHECK: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %b, i32 3
-// CHECK: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %b, i32 4
-// CHECK: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %b, i32 5
-// CHECK: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %b, i32 6
-// CHECK: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %b, i32 7
-// CHECK: [[MUL:%.*]]  = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP7]])
-// CHECK: ret <8 x half> [[MUL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxq_n_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], half noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[B]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[B]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[B]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[B]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[B]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[B]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[B]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
+// CHECK-NEXT:    [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]]) #[[ATTR4]]
+// CHECK-NEXT:    ret <8 x half> [[VMULX2_I]]
+//
 float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
   return vmulxq_n_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmulxh_lane_f16
-// CHECK: [[EXTR:%.*]] = extractelement <4 x half> %b, i32 3
-// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]]
-// CHECK: ret half [[MULX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxh_lane_f16
+// CHECK-SAME: (half noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <4 x half> [[B]], i32 3
+// CHECK-NEXT:    [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]])
+// CHECK-NEXT:    ret half [[VMULX]]
+//
 float16_t test_vmulxh_lane_f16(float16_t a, float16x4_t b) {
   return vmulxh_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulxh_laneq_f16
-// CHECK: [[EXTR:%.*]] = extractelement <8 x half> %b, i32 7
-// CHECK: [[MULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half %a, half [[EXTR]])
-// CHECK: ret half [[MULX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmulxh_laneq_f16
+// CHECK-SAME: (half noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <8 x half> [[B]], i32 7
+// CHECK-NEXT:    [[VMULX:%.*]] = call half @llvm.aarch64.neon.fmulx.f16(half [[A]], half [[EXTRACT]])
+// CHECK-NEXT:    ret half [[VMULX]]
+//
 float16_t test_vmulxh_laneq_f16(float16_t a, float16x8_t b) {
   return vmulxh_laneq_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vmaxv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VMAXV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v4f16(<4 x half> [[VMAXV]])
+// CHECK-NEXT:    ret half [[VMAXV1]]
+//
 float16_t test_vmaxv_f16(float16x4_t a) {
   return vmaxv_f16(a);
 }
 
-// CHECK-LABEL: test_vmaxvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VMAXV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VMAXV1:%.*]] = call half @llvm.aarch64.neon.fmaxv.f16.v8f16(<8 x half> [[VMAXV]])
+// CHECK-NEXT:    ret half [[VMAXV1]]
+//
 float16_t test_vmaxvq_f16(float16x8_t a) {
   return vmaxvq_f16(a);
 }
 
-// CHECK-LABEL: test_vminv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminv_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VMINV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v4f16(<4 x half> [[VMINV]])
+// CHECK-NEXT:    ret half [[VMINV1]]
+//
 float16_t test_vminv_f16(float16x4_t a) {
   return vminv_f16(a);
 }
 
-// CHECK-LABEL: test_vminvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminvq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VMINV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VMINV1:%.*]] = call half @llvm.aarch64.neon.fminv.f16.v8f16(<8 x half> [[VMINV]])
+// CHECK-NEXT:    ret half [[VMINV1]]
+//
 float16_t test_vminvq_f16(float16x8_t a) {
   return vminvq_f16(a);
 }
 
-// CHECK-LABEL: test_vmaxnmv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmv_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VMAXNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v4f16(<4 x half> [[VMAXNMV]])
+// CHECK-NEXT:    ret half [[VMAXNMV1]]
+//
 float16_t test_vmaxnmv_f16(float16x4_t a) {
   return vmaxnmv_f16(a);
 }
 
-// CHECK-LABEL: test_vmaxnmvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VMAXNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VMAXNMV1:%.*]] = call half @llvm.aarch64.neon.fmaxnmv.f16.v8f16(<8 x half> [[VMAXNMV]])
+// CHECK-NEXT:    ret half [[VMAXNMV1]]
+//
 float16_t test_vmaxnmvq_f16(float16x8_t a) {
   return vmaxnmvq_f16(a);
 }
 
-// CHECK-LABEL: test_vminnmv_f16
-// CHECK: [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnmv_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VMINNMV:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v4f16(<4 x half> [[VMINNMV]])
+// CHECK-NEXT:    ret half [[VMINNMV1]]
+//
 float16_t test_vminnmv_f16(float16x4_t a) {
   return vminnmv_f16(a);
 }
 
-// CHECK-LABEL: test_vminnmvq_f16
-// CHECK: [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK: [[MAX:%.*]]  = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[TMP1]])
-// CHECK: ret half [[MAX]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VMINNMV:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[VMINNMV1:%.*]] = call half @llvm.aarch64.neon.fminnmv.f16.v8f16(<8 x half> [[VMINNMV]])
+// CHECK-NEXT:    ret half [[VMINNMV1]]
+//
 float16_t test_vminnmvq_f16(float16x8_t a) {
   return vminnmvq_f16(a);
 }
 
-// CHECK-LABEL: test_vbsl_f16
-// CHECK:  [[TMP0:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:  [[TMP1:%.*]] = bitcast <4 x half> %c to <8 x i8>
-// CHECK:  [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
-// CHECK:  [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:  [[TMP4:%.*]] = and <4 x i16> %a, [[TMP2]]
-// CHECK:  [[TMP5:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK:  [[TMP6:%.*]] = and <4 x i16> [[TMP5]], [[TMP3]]
-// CHECK:  [[TMP7:%.*]] = or <4 x i16> [[TMP4]], [[TMP6]]
-// CHECK:  [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <4 x half>
-// CHECK:  ret <4 x half> [[TMP8]]
+// CHECK-LABEL: define {{[^@]+}}@test_vbsl_f16
+// CHECK-SAME: (<4 x i16> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i16> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
+// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <4 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <4 x half>
+// CHECK-NEXT:    ret <4 x half> [[TMP4]]
+//
 float16x4_t test_vbsl_f16(uint16x4_t a, float16x4_t b, float16x4_t c) {
   return vbsl_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_f16
-// CHECK:  [[TMP0:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:  [[TMP1:%.*]] = bitcast <8 x half> %c to <16 x i8>
-// CHECK:  [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
-// CHECK:  [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:  [[TMP4:%.*]] = and <8 x i16> %a, [[TMP2]]
-// CHECK:  [[TMP5:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
-// CHECK:  [[TMP6:%.*]] = and <8 x i16> [[TMP5]], [[TMP3]]
-// CHECK:  [[TMP7:%.*]] = or <8 x i16> [[TMP4]], [[TMP6]]
-// CHECK:  [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <8 x half>
-// CHECK:  ret <8 x half> [[TMP8]]
+// CHECK-LABEL: define {{[^@]+}}@test_vbslq_f16
+// CHECK-SAME: (<8 x i16> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
+// CHECK-NEXT:    [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK-NEXT:    [[VBSL3_I:%.*]] = and <8 x i16> [[A]], [[VBSL1_I]]
+// CHECK-NEXT:    [[TMP3:%.*]] = xor <8 x i16> [[A]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK-NEXT:    [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK-NEXT:    [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[VBSL5_I]] to <8 x half>
+// CHECK-NEXT:    ret <8 x half> [[TMP4]]
+//
 float16x8_t test_vbslq_f16(uint16x8_t a, float16x8_t b, float16x8_t c) {
   return vbslq_f16(a, b, c);
 }
 
-// CHECK-LABEL: test_vzip_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzip_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
+// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT:    store <4 x half> [[VZIP_I]], <4 x half>* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP7]], [2 x <4 x half>]* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP8]]
+//
 float16x4x2_t test_vzip_f16(float16x4_t a, float16x4_t b) {
   return vzip_f16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzipq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
+// CHECK-NEXT:    [[VZIP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT:    store <8 x half> [[VZIP_I]], <8 x half>* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VZIP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP7]], [2 x <8 x half>]* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP8]]
+//
 float16x8x2_t test_vzipq_f16(float16x8_t a, float16x8_t b) {
   return vzipq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
+// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT:    store <4 x half> [[VUZP_I]], <4 x half>* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VUZP1_I]], <4 x half>* [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP7]], [2 x <4 x half>]* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP8]]
+//
 float16x4x2_t test_vuzp_f16(float16x4_t a, float16x4_t b) {
   return vuzp_f16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]]
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzpq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
+// CHECK-NEXT:    [[VUZP_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT:    store <8 x half> [[VUZP_I]], <8 x half>* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VUZP1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VUZP1_I]], <8 x half>* [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP7]], [2 x <8 x half>]* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP8]]
+//
 float16x8x2_t test_vuzpq_f16(float16x8_t a, float16x8_t b) {
   return vuzpq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <4 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   store <4 x half> [[VZIP0_I]], <4 x half>* [[TMP1]]
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   store <4 x half> [[VZIP1_I]], <4 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x half>*
+// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT:    store <4 x half> [[VTRN_I]], <4 x half>* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <4 x half>, <4 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT:    store <4 x half> [[VTRN1_I]], <4 x half>* [[TMP4]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL_I]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X4X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[TMP7]], [2 x <4 x half>]* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP8]]
+//
 float16x4x2_t test_vtrn_f16(float16x4_t a, float16x4_t b) {
   return vtrn_f16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_f16
-// CHECK:   [[RETVAL:%.*]]  = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[TMP0:%.*]]  = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP1:%.*]]  = bitcast i8* [[TMP0]] to <8 x half>*
-// CHECK:   [[VZIP0_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   store <8 x half> [[VZIP0_I]], <8 x half>* [[TMP1]] 
-// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP1]], i32 1
-// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32>  <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   store <8 x half> [[VZIP1_I]], <8 x half>* [[TMP2]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrnq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL_I:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x half>*
+// CHECK-NEXT:    [[VTRN_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT:    store <8 x half> [[VTRN_I]], <8 x half>* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <8 x half>, <8 x half>* [[TMP3]], i32 1
+// CHECK-NEXT:    [[VTRN1_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT:    store <8 x half> [[VTRN1_I]], <8 x half>* [[TMP4]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL_I]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP7:%.*]] = extractvalue [[STRUCT_FLOAT16X8X2_T]] [[TMP5]], 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[TMP7]], [2 x <8 x half>]* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP8]]
+//
 float16x8x2_t test_vtrnq_f16(float16x8_t a, float16x8_t b) {
   return vtrnq_f16(a, b);
 }
 
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <4 x half> undef, half %a, i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %a, i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %a, i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %a, i32 3
-// CHECK:   ret <4 x half> [[TMP3]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmov_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
+//
 float16x4_t test_vmov_n_f16(float16_t a) {
   return vmov_n_f16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <8 x half> undef, half %a, i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %a, i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %a, i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %a, i32 3
-// CHECK:   [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %a, i32 4
-// CHECK:   [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %a, i32 5
-// CHECK:   [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %a, i32 6
-// CHECK:   [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %a, i32 7
-// CHECK:   ret <8 x half> [[TMP7]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmovq_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
+//
 float16x8_t test_vmovq_n_f16(float16_t a) {
   return vmovq_n_f16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <4 x half> undef, half %a, i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half %a, i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half %a, i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half %a, i32 3
-// CHECK:   ret <4 x half> [[TMP3]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdup_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    ret <4 x half> [[VECINIT3]]
+//
 float16x4_t test_vdup_n_f16(float16_t a) {
   return vdup_n_f16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK:   [[TMP0:%.*]] = insertelement <8 x half> undef, half %a, i32 0
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half %a, i32 1
-// CHECK:   [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half %a, i32 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half %a, i32 3
-// CHECK:   [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half %a, i32 4
-// CHECK:   [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half %a, i32 5
-// CHECK:   [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half %a, i32 6
-// CHECK:   [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half %a, i32 7
-// CHECK:   ret <8 x half> [[TMP7]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdupq_n_f16
+// CHECK-SAME: (half noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[A]], i32 0
+// CHECK-NEXT:    [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[A]], i32 1
+// CHECK-NEXT:    [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[A]], i32 2
+// CHECK-NEXT:    [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[A]], i32 3
+// CHECK-NEXT:    [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[A]], i32 4
+// CHECK-NEXT:    [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[A]], i32 5
+// CHECK-NEXT:    [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[A]], i32 6
+// CHECK-NEXT:    [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[A]], i32 7
+// CHECK-NEXT:    ret <8 x half> [[VECINIT7]]
+//
 float16x8_t test_vdupq_n_f16(float16_t a) {
   return vdupq_n_f16(a);
 }
 
-// CHECK-LABEL: test_vdup_lane_f16
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <4 x half> [[LANE]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdup_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    ret <4 x half> [[LANE]]
+//
 float16x4_t test_vdup_lane_f16(float16x4_t a) {
   return vdup_lane_f16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_f16
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
-// CHECK:   ret <8 x half> [[LANE]]
+// CHECK-LABEL: define {{[^@]+}}@test_vdupq_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK-NEXT:    ret <8 x half> [[LANE]]
+//
 float16x8_t test_vdupq_lane_f16(float16x4_t a) {
   return vdupq_lane_f16(a, 3);
 }
 
-// CHECK-LABEL: @test_vext_f16(
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK:   [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
-// CHECK:   ret <4 x half> [[VEXT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vext_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <4 x half> [[TMP2]], <4 x half> [[TMP3]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+// CHECK-NEXT:    ret <4 x half> [[VEXT]]
+//
 float16x4_t test_vext_f16(float16x4_t a, float16x4_t b) {
   return vext_f16(a, b, 2);
 }
 
-// CHECK-LABEL: @test_vextq_f16(
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK:   [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
-// CHECK:   ret <8 x half> [[VEXT]]
+// CHECK-LABEL: define {{[^@]+}}@test_vextq_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[VEXT:%.*]] = shufflevector <8 x half> [[TMP2]], <8 x half> [[TMP3]], <8 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12>
+// CHECK-NEXT:    ret <8 x half> [[VEXT]]
+//
 float16x8_t test_vextq_f16(float16x8_t a, float16x8_t b) {
   return vextq_f16(a, b, 5);
 }
 
-// CHECK-LABEL: @test_vrev64_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-// CHECK:   ret <4 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrev64_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[A]], <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vrev64_f16(float16x4_t a) {
   return vrev64_f16(a);
 }
 
-// CHECK-LABEL: @test_vrev64q_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-// CHECK:   ret <8 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vrev64q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[A]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vrev64q_f16(float16x8_t a) {
   return vrev64q_f16(a);
 }
 
-// CHECK-LABEL: @test_vzip1_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-// CHECK:   ret <4 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzip1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vzip1_f16(float16x4_t a, float16x4_t b) {
   return vzip1_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip1q_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-// CHECK:   ret <8 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzip1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vzip1q_f16(float16x8_t a, float16x8_t b) {
   return vzip1q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
-// CHECK:   ret <4 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzip2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vzip2_f16(float16x4_t a, float16x4_t b) {
   return vzip2_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vzip2q_f16(
-// CHECK:   [[SHFL:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-// CHECK:   ret <8 x half> [[SHFL]]
+// CHECK-LABEL: define {{[^@]+}}@test_vzip2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vzip2q_f16(float16x8_t a, float16x8_t b) {
   return vzip2q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-// CHECK:   ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vuzp1_f16(float16x4_t a, float16x4_t b) {
   return vuzp1_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp1q_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-// CHECK:   ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vuzp1q_f16(float16x8_t a, float16x8_t b) {
   return vuzp1q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-// CHECK:   ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vuzp2_f16(float16x4_t a, float16x4_t b) {
   return vuzp2_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vuzp2q_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-// CHECK:   ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vuzp2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vuzp2q_f16(float16x8_t a, float16x8_t b) {
   return vuzp2q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-// CHECK:   ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn1_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vtrn1_f16(float16x4_t a, float16x4_t b) {
   return vtrn1_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn1q_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32>  <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
-// CHECK:   ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn1q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vtrn1q_f16(float16x8_t a, float16x8_t b) {
   return vtrn1q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn2_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-// CHECK:   ret <4 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn2_f16
+// CHECK-SAME: (<4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <4 x half> [[A]], <4 x half> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK-NEXT:    ret <4 x half> [[SHUFFLE_I]]
+//
 float16x4_t test_vtrn2_f16(float16x4_t a, float16x4_t b) {
   return vtrn2_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vtrn2q_f16(
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
-// CHECK:   ret <8 x half> [[SHUFFLE_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vtrn2q_f16
+// CHECK-SAME: (<8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <8 x half> [[A]], <8 x half> [[B]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK-NEXT:    ret <8 x half> [[SHUFFLE_I]]
+//
 float16x8_t test_vtrn2q_f16(float16x8_t a, float16x8_t b) {
   return vtrn2q_f16(a, b);
 }
 
-// CHECK-LABEL: @test_vduph_laneq_f16(
-// CHECK:        [[V:%.*]] = extractelement <8 x half> [[V2:%.*]], i32 7
-// CHECK-NEXT:   ret half [[V]]
+// CHECK-LABEL: define {{[^@]+}}@test_vduph_laneq_f16
+// CHECK-SAME: (<8 x half> noundef [[VEC:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x half> [[VEC]], i32 7
+// CHECK-NEXT:    ret half [[VGETQ_LANE]]
+//
 float16_t test_vduph_laneq_f16(float16x8_t vec) {
   return vduph_laneq_f16(vec, 7);
 }
 
-// CHECK-LABEL: @test_vduph_lane_f16(
-// CHECK:        [[V:%.*]] = extractelement <4 x half> [[V2:%.*]], i32 3
-// CHECK-NEXT:   ret half [[V]]
+// CHECK-LABEL: define {{[^@]+}}@test_vduph_lane_f16
+// CHECK-SAME: (<4 x half> noundef [[VEC:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x half> [[VEC]], i32 3
+// CHECK-NEXT:    ret half [[VGET_LANE]]
+//
 float16_t test_vduph_lane_f16(float16x4_t vec) {
   return vduph_lane_f16(vec, 3);
 }

diff  --git a/clang/test/CodeGen/arm-neon-fma.c b/clang/test/CodeGen/arm-neon-fma.c
index 9f74584d0c03b..cd534f9cd1879 100644
--- a/clang/test/CodeGen/arm-neon-fma.c
+++ b/clang/test/CodeGen/arm-neon-fma.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \
 // RUN:   -target-abi aapcs \
 // RUN:   -target-cpu cortex-a7 \
@@ -9,43 +10,61 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_fma_order(<2 x float> noundef %accum, <2 x float> noundef %lhs, <2 x float> noundef %rhs) #0 {
-// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum) #3
-// CHECK:   ret <2 x float> [[TMP6]]
+// CHECK-LABEL: define {{[^@]+}}@test_fma_order
+// CHECK-SAME: (<2 x float> noundef [[ACCUM:%.*]], <2 x float> noundef [[LHS:%.*]], <2 x float> noundef [[RHS:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    ret <2 x float> [[TMP3]]
+//
 float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
   return vfma_f32(accum, lhs, rhs);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_fmaq_order(<4 x float> noundef %accum, <4 x float> noundef %lhs, <4 x float> noundef %rhs) #1 {
-// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum) #3
-// CHECK:   ret <4 x float> [[TMP6]]
+// CHECK-LABEL: define {{[^@]+}}@test_fmaq_order
+// CHECK-SAME: (<4 x float> noundef [[ACCUM:%.*]], <4 x float> noundef [[LHS:%.*]], <4 x float> noundef [[RHS:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
 float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
   return vfmaq_f32(accum, lhs, rhs);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vfma_n_f32(<2 x float> noundef %a, <2 x float> noundef %b, float noundef %n) #0 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
-// CHECK:   [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> [[VECINIT1_I]], <2 x float> %a)
-// CHECK:   ret <2 x float> [[TMP3]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfma_n_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float [[N]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float [[N]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x float> [[TMP3]]
+//
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
   return vfma_n_f32(a, b, n);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vfmaq_n_f32(<4 x float> noundef %a, <4 x float> noundef %b, float noundef %n) #1 {
-// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
-// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
-// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
-// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
-// CHECK:   [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> [[VECINIT3_I]], <4 x float> %a)
-// CHECK:   ret <4 x float> [[TMP3]]
+// CHECK-LABEL: define {{[^@]+}}@test_vfmaq_n_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]], float noundef [[N:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[N]], i32 0
+// CHECK-NEXT:    [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float [[N]], i32 1
+// CHECK-NEXT:    [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float [[N]], i32 2
+// CHECK-NEXT:    [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float [[N]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x float> [[TMP3]]
+//
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
   return vfmaq_n_f32(a, b, n);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"

diff  --git a/clang/test/CodeGen/arm-neon-numeric-maxmin.c b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
index a2171761c90b6..849f5adebeda4 100644
--- a/clang/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/clang/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -1,36 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vmaxnm_f32(<2 x float> noundef %a, <2 x float> noundef %b) #0 {
-// CHECK:   [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b) #3
-// CHECK:   ret <2 x float> [[VMAXNM_V2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnm_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <2 x float> [[VMAXNM_V2_I]]
+//
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
   return vmaxnm_f32(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vmaxnmq_f32(<4 x float> noundef %a, <4 x float> noundef %b) #1 {
-// CHECK:   [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b) #3
-// CHECK:   ret <4 x float> [[VMAXNMQ_V2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <4 x float> [[VMAXNMQ_V2_I]]
+//
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
   return vmaxnmq_f32(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vminnm_f32(<2 x float> noundef %a, <2 x float> noundef %b) #0 {
-// CHECK:   [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b) #3
-// CHECK:   ret <2 x float> [[VMINNM_V2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnm_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
+// CHECK-NEXT:    [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
+// CHECK-NEXT:    ret <2 x float> [[VMINNM_V2_I]]
+//
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
   return vminnm_f32(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vminnmq_f32(<4 x float> noundef %a, <4 x float> noundef %b) #1 {
-// CHECK:   [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b) #3
-// CHECK:   ret <4 x float> [[VMINNMQ_V2_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vminnmq_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
+// CHECK-NEXT:    [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
+// CHECK-NEXT:    [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
+// CHECK-NEXT:    ret <4 x float> [[VMINNMQ_V2_I]]
+//
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
   return vminnmq_f32(a, b);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"

diff  --git a/clang/test/CodeGen/arm-neon-vcvtX.c b/clang/test/CodeGen/arm-neon-vcvtX.c
index e7f6d42fa1deb..71f8e5bca79da 100644
--- a/clang/test/CodeGen/arm-neon-vcvtX.c
+++ b/clang/test/CodeGen/arm-neon-vcvtX.c
@@ -1,120 +1,183 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -disable-O0-optnone -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: aarch64-registered-target || arm-registered-target
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvta_s32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTA_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvta_s32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTA_S32_V1_I]]
+//
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
   return vcvta_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvta_u32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTA_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvta_u32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTA_U32_V1_I]]
+//
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
   return vcvta_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtaq_s32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTAQ_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_s32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTAQ_S32_V1_I]]
+//
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
   return vcvtaq_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtaq_u32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTAQ_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtaq_u32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTAQ_U32_V1_I]]
+//
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
   return vcvtaq_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtn_s32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTN_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_s32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTN_S32_V1_I]]
+//
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
   return vcvtn_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtn_u32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTN_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtn_u32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTN_U32_V1_I]]
+//
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
   return vcvtn_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtnq_s32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTNQ_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_s32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTNQ_S32_V1_I]]
+//
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
   return vcvtnq_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtnq_u32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTNQ_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtnq_u32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTNQ_U32_V1_I]]
+//
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
   return vcvtnq_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtp_s32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTP_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_s32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTP_S32_V1_I]]
+//
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
   return vcvtp_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtp_u32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTP_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtp_u32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTP_U32_V1_I]]
+//
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
   return vcvtp_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtpq_s32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTPQ_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_s32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTPQ_S32_V1_I]]
+//
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
   return vcvtpq_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtpq_u32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTPQ_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtpq_u32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTPQ_U32_V1_I]]
+//
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
   return vcvtpq_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtm_s32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTM_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_s32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTM_S32_V1_I]]
+//
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
   return vcvtm_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vcvtm_u32_f32(<2 x float> noundef %a) #0 {
-// CHECK:   [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a) #3
-// CHECK:   ret <2 x i32> [[VCVTM_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtm_u32_f32
+// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
+// CHECK-NEXT:    [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <2 x i32> [[VCVTM_U32_V1_I]]
+//
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
   return vcvtm_u32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtmq_s32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTMQ_S32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_s32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTMQ_S32_V1_I]]
+//
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
   return vcvtmq_s32_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vcvtmq_u32_f32(<4 x float> noundef %a) #1 {
-// CHECK:   [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a) #3
-// CHECK:   ret <4 x i32> [[VCVTMQ_U32_V1_I]]
+// CHECK-LABEL: define {{[^@]+}}@test_vcvtmq_u32_f32
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
+// CHECK-NEXT:    [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
+// CHECK-NEXT:    ret <4 x i32> [[VCVTMQ_U32_V1_I]]
+//
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
   return vcvtmq_u32_f32(a);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"


        


More information about the cfe-commits mailing list