[clang] 053dc95 - Reduce the number of attributes attached to each function

Dávid Bolvanský via cfe-commits <cfe-commits at lists.llvm.org>
Tue Feb 23 22:09:03 PST 2021


Author: Dávid Bolvanský
Date: 2021-02-24T07:08:44+01:00
New Revision: 053dc95839b3b8a36db46f8c419e36e632e989cd

URL: https://github.com/llvm/llvm-project/commit/053dc95839b3b8a36db46f8c419e36e632e989cd
DIFF: https://github.com/llvm/llvm-project/commit/053dc95839b3b8a36db46f8c419e36e632e989cd.diff

LOG: Reduce the number of attributes attached to each function

This patch takes advantage of the implicit default behavior to reduce the number of attributes attached to each function, which in turn reduces compilation time.

Reviewed By: serge-sans-paille

Differential Revision: https://reviews.llvm.org/D97116
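
Per the log message, the patch relies on consumers treating an absent
"min-legal-vector-width" attribute the same as the value the frontend would
otherwise have emitted. A minimal sketch of such a consumer, assuming an
implicit default of 0 (the helper name is illustrative, not LLVM's API):

    #include "llvm/IR/Function.h"

    // Sketch only: a missing "min-legal-vector-width" string attribute is
    // read back as the assumed default, so emitting the default explicitly
    // carries no information.
    static unsigned getMinLegalVectorWidth(const llvm::Function &F) {
      unsigned Width = 0; // assumed implicit default when absent
      if (F.hasFnAttribute("min-legal-vector-width"))
        F.getFnAttribute("min-legal-vector-width")
            .getValueAsString()
            .getAsInteger(0, Width);
      return Width;
    }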

Added: 
    

Modified: 
    clang/lib/CodeGen/CodeGenFunction.cpp
    clang/test/CodeGen/aarch64-neon-ldst-one.c
    clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
    clang/test/CodeGen/aarch64-poly128.c
    clang/test/CodeGen/aarch64-poly64.c
    clang/test/CodeGenCXX/dllexport-ctor-closure.cpp
    clang/test/CodeGenCXX/dllexport.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 5888866b2546..ef7f9fb25e7f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -493,7 +493,8 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) {
   // 4. Width of vector arguments and return types for this function.
  // 5. Width of vector arguments and return types for functions called by this
   //    function.
-  CurFn->addFnAttr("min-legal-vector-width", llvm::utostr(LargestVectorWidth));
+  if (LargestVectorWidth)
+    CurFn->addFnAttr("min-legal-vector-width", llvm::utostr(LargestVectorWidth));
 
   // If we generated an unreachable return block, delete it now.
   if (ReturnBlock.isValid() && ReturnBlock.getBlock()->use_empty()) {

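The hunk above guards only the zero case of this one attribute; the same
"emit only non-default values" shape generalizes to other numeric string
attributes. A hedged sketch of that pattern (the helper is hypothetical and
not part of this patch):

    #include "llvm/ADT/StringExtras.h" // llvm::utostr
    #include "llvm/IR/Function.h"

    // Hypothetical helper: attach a numeric string attribute only when it
    // differs from the value consumers assume implicitly.
    static void addNonDefaultFnAttr(llvm::Function *F, llvm::StringRef Name,
                                    unsigned Value, unsigned Default = 0) {
      if (Value != Default)
        F->addFnAttr(Name, llvm::utostr(Value));
    }

The test updates below are mechanical regenerations with
utils/update_cc_test_checks.py, which is why the hand-written
"define{{.*}} ... #N" labels give way to autogenerated CHECK-LABEL /
CHECK-NEXT lines that no longer depend on attribute-group numbering.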
diff  --git a/clang/test/CodeGen/aarch64-neon-ldst-one.c b/clang/test/CodeGen/aarch64-neon-ldst-one.c
index db352fecf221..e4fce996a1f3 100644
--- a/clang/test/CodeGen/aarch64-neon-ldst-one.c
+++ b/clang/test/CodeGen/aarch64-neon-ldst-one.c
@@ -1,6803 +1,7341 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN:  -disable-O0-optnone -fallow-half-arguments-and-returns -emit-llvm -o - %s \
 // RUN: | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK:   ret <16 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT:    ret <16 x i8> [[LANE]]
+//
 uint8x16_t test_vld1q_dup_u8(uint8_t  *a) {
   return vld1q_dup_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i16> [[LANE]]
+//
 uint16x8_t test_vld1q_dup_u16(uint16_t  *a) {
   return vld1q_dup_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x i32> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x i32> [[LANE]]
+//
 uint32x4_t test_vld1q_dup_u32(uint32_t  *a) {
   return vld1q_dup_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x i64> [[LANE]]
+//
 uint64x2_t test_vld1q_dup_u64(uint64_t  *a) {
   return vld1q_dup_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK:   ret <16 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT:    ret <16 x i8> [[LANE]]
+//
 int8x16_t test_vld1q_dup_s8(int8_t  *a) {
   return vld1q_dup_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i16> [[LANE]]
+//
 int16x8_t test_vld1q_dup_s16(int16_t  *a) {
   return vld1q_dup_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x i32> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x i32> [[LANE]]
+//
 int32x4_t test_vld1q_dup_s32(int32_t  *a) {
   return vld1q_dup_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x i64> [[LANE]]
+//
 int64x2_t test_vld1q_dup_s64(int64_t  *a) {
   return vld1q_dup_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x half> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x half> [[LANE]]
+//
 float16x8_t test_vld1q_dup_f16(float16_t  *a) {
   return vld1q_dup_f16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x float> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x float> [[LANE]]
+//
 float32x4_t test_vld1q_dup_f32(float32_t  *a) {
   return vld1q_dup_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x double> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x double> [[LANE]]
+//
 float64x2_t test_vld1q_dup_f64(float64_t  *a) {
   return vld1q_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
-// CHECK:   ret <16 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK-NEXT:    ret <16 x i8> [[LANE]]
+//
 poly8x16_t test_vld1q_dup_p8(poly8_t  *a) {
   return vld1q_dup_p8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i16> [[LANE]]
+//
 poly16x8_t test_vld1q_dup_p16(poly16_t  *a) {
   return vld1q_dup_p16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1q_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x i64> [[LANE]]
+//
 poly64x2_t test_vld1q_dup_p64(poly64_t  *a) {
   return vld1q_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_u8(i8* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i8> [[LANE]]
+//
 uint8x8_t test_vld1_dup_u8(uint8_t  *a) {
   return vld1_dup_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_u16(i16* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x i16> [[LANE]]
+//
 uint16x4_t test_vld1_dup_u16(uint16_t  *a) {
   return vld1_dup_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_u32(i32* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i32> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x i32> [[LANE]]
+//
 uint32x2_t test_vld1_dup_u32(uint32_t  *a) {
   return vld1_dup_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_u64(i64* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    ret <1 x i64> [[LANE]]
+//
 uint64x1_t test_vld1_dup_u64(uint64_t  *a) {
   return vld1_dup_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_s8(i8* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i8> [[LANE]]
+//
 int8x8_t test_vld1_dup_s8(int8_t  *a) {
   return vld1_dup_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_s16(i16* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x i16> [[LANE]]
+//
 int16x4_t test_vld1_dup_s16(int16_t  *a) {
   return vld1_dup_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_dup_s32(i32* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x i32> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x i32> [[LANE]]
+//
 int32x2_t test_vld1_dup_s32(int32_t  *a) {
   return vld1_dup_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_s64(i64* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    ret <1 x i64> [[LANE]]
+//
 int64x1_t test_vld1_dup_s64(int64_t  *a) {
   return vld1_dup_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_dup_f16(half* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x half> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x half> [[LANE]]
+//
 float16x4_t test_vld1_dup_f16(float16_t  *a) {
   return vld1_dup_f16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_dup_f32(float* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
-// CHECK:   ret <2 x float> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK-NEXT:    ret <2 x float> [[LANE]]
+//
 float32x2_t test_vld1_dup_f32(float32_t  *a) {
   return vld1_dup_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_dup_f64(double* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   ret <1 x double> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    [[TMP2:%.*]] = load double, double* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    ret <1 x double> [[LANE]]
+//
 float64x1_t test_vld1_dup_f64(float64_t  *a) {
   return vld1_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_dup_p8(i8* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
-// CHECK:   ret <8 x i8> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK-NEXT:    ret <8 x i8> [[LANE]]
+//
 poly8x8_t test_vld1_dup_p8(poly8_t  *a) {
   return vld1_dup_p8(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_dup_p16(i16* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   ret <4 x i16> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK-NEXT:    ret <4 x i16> [[LANE]]
+//
 poly16x4_t test_vld1_dup_p16(poly16_t  *a) {
   return vld1_dup_p16(a);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_dup_p64(i64* %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
-// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   ret <1 x i64> [[LANE]]
+// CHECK-LABEL: @test_vld1_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    ret <1 x i64> [[LANE]]
+//
 poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
   return vld1_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2q_dup_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X2_T]] [[TMP6]]
+//
 uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
   return vld2q_dup_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2q_dup_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X2_T]] [[TMP6]]
+//
 int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
   return vld2q_dup_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2q_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP6]]
+//
 float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
   return vld2q_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2q_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X2_T]] [[TMP6]]
+//
 poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
   return vld2q_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP6]]
+//
 float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
   return vld2_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
+// CHECK-LABEL: @test_vld2_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X2_T]] [[TMP6]]
+//
 poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
   return vld2_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3q_dup_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X3_T]] [[TMP6]]
+//
 uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
   return vld3q_dup_u64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3q_dup_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X3_T]] [[TMP6]]
+//
 int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
   return vld3q_dup_s64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3q_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP6]]
+//
 float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
   return vld3q_dup_f64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3q_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X3_T]] [[TMP6]]
+//
 poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
   return vld3q_dup_p64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP6]]
+//
 float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
   return vld3_dup_f64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
+// CHECK-LABEL: @test_vld3_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X3_T]] [[TMP6]]
+//
 poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
   return vld3_dup_p64(a);
   // [{{x[0-9]+|sp}}]
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4q_dup_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X4_T]] [[TMP6]]
+//
 uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
   return vld4q_dup_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4q_dup_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X4_T]] [[TMP6]]
+//
 int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
   return vld4q_dup_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4q_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP6]]
+//
 float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
   return vld4q_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4q_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP4]], i8* align 16 [[TMP5]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X4_T]] [[TMP6]]
+//
 poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
   return vld4q_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
-// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4_dup_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP6]]
+//
 float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
   return vld4_dup_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
-// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
-// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
-// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
+// CHECK-LABEL: @test_vld4_dup_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK-NEXT:    [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP4]], i8* align 8 [[TMP5]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X4_T]] [[TMP6]]
+//
 poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
   return vld4_dup_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK:   ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B:%.*]], i8 [[TMP0]], i32 15
+// CHECK-NEXT:    ret <16 x i8> [[VLD1_LANE]]
+//
 uint8x16_t test_vld1q_lane_u8(uint8_t  *a, uint8x16_t b) {
   return vld1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK:   ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK-NEXT:    ret <8 x i16> [[VLD1_LANE]]
+//
 uint16x8_t test_vld1q_lane_u16(uint16_t  *a, uint16x8_t b) {
   return vld1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK:   ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x i32> [[VLD1_LANE]]
+//
 uint32x4_t test_vld1q_lane_u32(uint32_t  *a, uint32x4_t b) {
   return vld1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK:   ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLD1_LANE]]
+//
 uint64x2_t test_vld1q_lane_u64(uint64_t  *a, uint64x2_t b) {
   return vld1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK:   ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B:%.*]], i8 [[TMP0]], i32 15
+// CHECK-NEXT:    ret <16 x i8> [[VLD1_LANE]]
+//
 int8x16_t test_vld1q_lane_s8(int8_t  *a, int8x16_t b) {
   return vld1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK:   ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK-NEXT:    ret <8 x i16> [[VLD1_LANE]]
+//
 int16x8_t test_vld1q_lane_s16(int16_t  *a, int16x8_t b) {
   return vld1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
-// CHECK:   ret <4 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x i32> [[VLD1_LANE]]
+//
 int32x4_t test_vld1q_lane_s32(int32_t  *a, int32x4_t b) {
   return vld1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK:   ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLD1_LANE]]
+//
 int64x2_t test_vld1q_lane_s64(int64_t  *a, int64x2_t b) {
   return vld1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   [[TMP4:%.*]] = load half, half* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
-// CHECK:   ret <8 x half> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
+// CHECK-NEXT:    ret <8 x half> [[VLD1_LANE]]
+//
 float16x8_t test_vld1q_lane_f16(float16_t  *a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
-// CHECK:   ret <4 x float> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x float> [[VLD1_LANE]]
+//
 float32x4_t test_vld1q_lane_f32(float32_t  *a, float32x4_t b) {
   return vld1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
-// CHECK:   ret <2 x double> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x double> [[VLD1_LANE]]
+//
 float64x2_t test_vld1q_lane_f64(float64_t  *a, float64x2_t b) {
   return vld1q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
-// CHECK:   ret <16 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <16 x i8> [[B:%.*]], i8 [[TMP0]], i32 15
+// CHECK-NEXT:    ret <16 x i8> [[VLD1_LANE]]
+//
 poly8x16_t test_vld1q_lane_p8(poly8_t  *a, poly8x16_t b) {
   return vld1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK:   ret <8 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK-NEXT:    ret <8 x i16> [[VLD1_LANE]]
+//
 poly16x8_t test_vld1q_lane_p16(poly16_t  *a, poly16x8_t b) {
   return vld1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
-// CHECK:   ret <2 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x i64> [[VLD1_LANE]]
+//
 poly64x2_t test_vld1q_lane_p64(poly64_t  *a, poly64x2_t b) {
   return vld1q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK:   ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B:%.*]], i8 [[TMP0]], i32 7
+// CHECK-NEXT:    ret <8 x i8> [[VLD1_LANE]]
+//
 uint8x8_t test_vld1_lane_u8(uint8_t  *a, uint8x8_t b) {
   return vld1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK:   ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x i16> [[VLD1_LANE]]
+//
 uint16x4_t test_vld1_lane_u16(uint16_t  *a, uint16x4_t b) {
   return vld1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK:   ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x i32> [[VLD1_LANE]]
+//
 uint32x2_t test_vld1_lane_u32(uint32_t  *a, uint32x2_t b) {
   return vld1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK:   ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLD1_LANE]]
+//
 uint64x1_t test_vld1_lane_u64(uint64_t  *a, uint64x1_t b) {
   return vld1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK:   ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B:%.*]], i8 [[TMP0]], i32 7
+// CHECK-NEXT:    ret <8 x i8> [[VLD1_LANE]]
+//
 int8x8_t test_vld1_lane_s8(int8_t  *a, int8x8_t b) {
   return vld1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK:   ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x i16> [[VLD1_LANE]]
+//
 int16x4_t test_vld1_lane_s16(int16_t  *a, int16x4_t b) {
   return vld1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
-// CHECK:   ret <2 x i32> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x i32> [[VLD1_LANE]]
+//
 int32x2_t test_vld1_lane_s32(int32_t  *a, int32x2_t b) {
   return vld1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK:   ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLD1_LANE]]
+//
 int64x1_t test_vld1_lane_s64(int64_t  *a, int64x1_t b) {
   return vld1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   [[TMP4:%.*]] = load half, half* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
-// CHECK:   ret <4 x half> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x half> [[VLD1_LANE]]
+//
 float16x4_t test_vld1_lane_f16(float16_t  *a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
-// CHECK:   ret <2 x float> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK-NEXT:    ret <2 x float> [[VLD1_LANE]]
+//
 float32x2_t test_vld1_lane_f32(float32_t  *a, float32x2_t b) {
   return vld1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
-// CHECK:   ret <1 x double> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VLD1_LANE]]
+//
 float64x1_t test_vld1_lane_f64(float64_t  *a, float64x1_t b) {
   return vld1_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
-// CHECK:   ret <8 x i8> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A:%.*]], align 1
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <8 x i8> [[B:%.*]], i8 [[TMP0]], i32 7
+// CHECK-NEXT:    ret <8 x i8> [[VLD1_LANE]]
+//
 poly8x8_t test_vld1_lane_p8(poly8_t  *a, poly8x8_t b) {
   return vld1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK:   ret <4 x i16> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK-NEXT:    ret <4 x i16> [[VLD1_LANE]]
+//
 poly16x4_t test_vld1_lane_p16(poly16_t  *a, poly16x4_t b) {
   return vld1_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
-// CHECK:   ret <1 x i64> [[VLD1_LANE]]
+// CHECK-LABEL: @test_vld1_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 8
+// CHECK-NEXT:    [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK-NEXT:    ret <1 x i64> [[VLD1_LANE]]
+//
 poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
   return vld1_lane_p64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[SRC_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[PTR:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X2_T]] [[TMP8]]
+//
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
   return vld2q_lane_s8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[SRC_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[PTR:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X2_T]] [[TMP8]]
+//
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
   return vld2q_lane_u8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[SRC_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[PTR:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP6]], i8* align 16 [[TMP7]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X2_T]] [[TMP8]]
+//
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
   return vld2q_lane_p8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int8x16x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[SRC_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[PTR:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X3_T]] [[TMP9]]
+//
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
   return vld3q_lane_s8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[SRC:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[SRC_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[PTR:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X3_T]] [[TMP9]]
+//
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
   return vld3q_lane_u8(ptr, src, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X2_T]] [[TMP13]]
+//
 uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
   return vld2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE:%.*]], [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X2_T]] [[TMP13]]
+//
 uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
   return vld2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X2_T]] [[TMP13]]
+//
 uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
   return vld2q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int16x8x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X2_T]] [[TMP13]]
+//
 int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
   return vld2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int32x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE:%.*]], [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X2_T]] [[TMP13]]
+//
 int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
   return vld2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X2_T]] [[TMP13]]
+//
 int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
   return vld2q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0i8(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half> }*
-// CHECK:   store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float16x8x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[B_COERCE:%.*]], [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0i8(<8 x half> [[TMP8]], <8 x half> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half> }*
+// CHECK-NEXT:    store { <8 x half>, <8 x half> } [[VLD2_LANE]], { <8 x half>, <8 x half> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X2_T]] [[TMP13]]
+//
 float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
-// CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float32x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x float>] [[B_COERCE:%.*]], [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
+// CHECK-NEXT:    store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X2_T]] [[TMP13]]
+//
 float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
   return vld2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE:%.*]], [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X2_T]] [[TMP13]]
+//
 float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
   return vld2q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X2_T]] [[TMP13]]
+//
 poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
   return vld2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP11]], i8* align 16 [[TMP12]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X2_T]] [[TMP13]]
+//
 poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
   return vld2q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X2_T]] [[TMP8]]
+//
 uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
   return vld2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X2_T]] [[TMP13]]
+//
 uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
   return vld2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE:%.*]], [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X2_T]] [[TMP13]]
+//
 uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
   return vld2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X2_T]] [[TMP13]]
+//
 uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
   return vld2_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X2_T]] [[TMP8]]
+//
 int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
   return vld2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int16x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X2_T]] [[TMP13]]
+//
 int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
   return vld2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int32x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE:%.*]], [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X2_T]] [[TMP13]]
+//
 int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
   return vld2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int64x1x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X2_T]] [[TMP13]]
+//
 int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
   return vld2_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0i8(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half> }*
-// CHECK:   store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float16x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[B_COERCE:%.*]], [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0i8(<4 x half> [[TMP8]], <4 x half> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half> }*
+// CHECK-NEXT:    store { <4 x half>, <4 x half> } [[VLD2_LANE]], { <4 x half>, <4 x half> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X2_T]] [[TMP13]]
+//
 float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
   return vld2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
-// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float32x2x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x float>] [[B_COERCE:%.*]], [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK-NEXT:    store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X2_T]] [[TMP13]]
+//
 float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
   return vld2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE:%.*]], [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X2_T]] [[TMP13]]
+//
 float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
   return vld2_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
-// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
-// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
+// CHECK-LABEL: @test_vld2_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP6]], i8* align 8 [[TMP7]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP8:%.*]] = load [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X2_T]] [[TMP8]]
+//
 poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
   return vld2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X2_T]] [[TMP13]]
+//
 poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
   return vld2_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
-// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
-// CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
+// CHECK-LABEL: @test_vld2_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP11]], i8* align 8 [[TMP12]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X2_T]] [[TMP13]]
+//
 poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
   return vld2_lane_p64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X3_T]] [[TMP16]]
+//
 uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
   return vld3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE:%.*]], [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X3_T]] [[TMP16]]
+//
 uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
   return vld3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X3_T]] [[TMP16]]
+//
 uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
   return vld3q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int16x8x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X3_T]] [[TMP16]]
+//
 int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
   return vld3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int32x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE:%.*]], [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X3_T]] [[TMP16]]
+//
 int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
   return vld3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X3_T]] [[TMP16]]
+//
 int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
   return vld3q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0i8(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half> }*
-// CHECK:   store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float16x8x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x half>] [[B_COERCE:%.*]], [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0i8(<8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half> }*
+// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half> } [[VLD3_LANE]], { <8 x half>, <8 x half>, <8 x half> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X3_T]] [[TMP16]]
+//
 float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
-// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float32x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x float>] [[B_COERCE:%.*]], [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X3_T]] [[TMP16]]
+//
 float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE:%.*]], [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X3_T]] [[TMP16]]
+//
 float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
   return vld3q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP7]], i8* align 16 [[TMP8]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X3_T]] [[TMP9]]
+//
 poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
   return vld3q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X3_T]] [[TMP16]]
+//
 poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]], align 16
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP14]], i8* align 16 [[TMP15]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X3_T]] [[TMP16]]
+//
 poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
   return vld3q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X3_T]] [[TMP9]]
+//
 uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X3_T]] [[TMP16]]
+//
 uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE:%.*]], [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X3_T]] [[TMP16]]
+//
 uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X3_T]] [[TMP16]]
+//
 uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
   return vld3_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int8x8x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X3_T]] [[TMP9]]
+//
 int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
   return vld3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int16x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X3_T]] [[TMP16]]
+//
 int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
   return vld3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int32x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE:%.*]], [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X3_T]] [[TMP16]]
+//
 int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
   return vld3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int64x1x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X3_T]] [[TMP16]]
+//
 int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
   return vld3_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0i8(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half> }*
-// CHECK:   store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float16x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x half>] [[B_COERCE:%.*]], [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0i8(<4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half> }*
+// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half> } [[VLD3_LANE]], { <4 x half>, <4 x half>, <4 x half> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X3_T]] [[TMP16]]
+//
 float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
   return vld3_lane_f16(a, b, 3);
 }
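 
 // For context, vld3_lane_f16 is a de-interleaving lane load. A minimal
 // usage sketch, assuming a buffer `p` holding at least three float16
 // values (the helper name and buffer are illustrative assumptions):
 //
 //   #include <arm_neon.h>
 //
 //   /* Reads p[0], p[1], p[2] and writes them into lane 3 of v.val[0],
 //      v.val[1] and v.val[2]; every other lane passes through unchanged. */
 //   float16x4x3_t load_lane3(const float16_t *p, float16x4x3_t v) {
 //     return vld3_lane_f16(p, v, 3);
 //   }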
 
-// CHECK-LABEL: define{{.*}} %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
-// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float32x2x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x float>] [[B_COERCE:%.*]], [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X3_T]] [[TMP16]]
+//
 float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
   return vld3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE:%.*]], [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X3_T]] [[TMP16]]
+//
 float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
   return vld3_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
-// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
-// CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
+// CHECK-LABEL: @test_vld3_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP7]], i8* align 8 [[TMP8]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP9:%.*]] = load [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X3_T]] [[TMP9]]
+//
 poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
   return vld3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X3_T]] [[TMP16]]
+//
 poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
   return vld3_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
-// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
-// CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
+// CHECK-LABEL: @test_vld3_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]], align 8
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP14]], i8* align 8 [[TMP15]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP16:%.*]] = load [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X3_T]] [[TMP16]]
+//
 poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
   return vld3_lane_p64(a, b, 0);
 }
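 
 // A note on the lane immediates above: the d-register forms with 64-bit
 // elements (float64x1, poly64x1) carry a single lane per vector, so 0 is
 // the only legal lane index, as the f64 and p64 tests use. A sketch,
 // with an assumed buffer `p` of three poly64 values:
 //
 //   #include <arm_neon.h>
 //
 //   /* <1 x i64>-shaped vectors expose only lane 0. */
 //   poly64x1x3_t reload_lane0(const poly64_t *p, poly64x1x3_t v) {
 //     return vld3_lane_p64(p, v, 0);
 //   }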
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x16x4_t @test_vld4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT8X16X4_T]] [[TMP10]]
+//
 uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
   return vld4q_lane_u8(a, b, 15);
 }
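 
 // The q-suffixed four-register form works on 128-bit vectors, so for u8
 // the lane immediate ranges over 0..15. A minimal sketch (helper name
 // and buffer `p` are assumptions for illustration):
 //
 //   #include <arm_neon.h>
 //
 //   /* De-interleaves four consecutive bytes at p into lane 15 of the
 //      four <16 x i8> vectors carried in the uint8x16x4_t struct. */
 //   uint8x16x4_t reload_lane15(const uint8_t *p, uint8x16x4_t v) {
 //     return vld4q_lane_u8(p, v, 15);
 //   }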
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT16X8X4_T]] [[TMP19]]
+//
 uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
   return vld4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE:%.*]], [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT32X4X4_T]] [[TMP19]]
+//
 uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
   return vld4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x2x4_t @test_vld4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_UINT64X2X4_T]] [[TMP19]]
+//
 uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
   return vld4q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int8x16x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT8X16X4_T]] [[TMP10]]
+//
 int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
   return vld4q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int16x8x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT16X8X4_T]] [[TMP19]]
+//
 int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
   return vld4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
-// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int32x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE:%.*]], [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK-NEXT:    store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT32X4X4_T]] [[TMP19]]
+//
 int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
   return vld4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.int64x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_INT64X2X4_T]] [[TMP19]]
+//
 int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
   return vld4q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0i8(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }*
-// CHECK:   store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float16x8x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x half>] [[B_COERCE:%.*]], [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0i8(<8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x half>, <8 x half>, <8 x half>, <8 x half> }*
+// CHECK-NEXT:    store { <8 x half>, <8 x half>, <8 x half>, <8 x half> } [[VLD4_LANE]], { <8 x half>, <8 x half>, <8 x half>, <8 x half> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X8X4_T]] [[TMP19]]
+//
 float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
-// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float32x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x float>] [[B_COERCE:%.*]], [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK-NEXT:    store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X4X4_T]] [[TMP19]]
+//
 float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
   return vld4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
-// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.float64x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE:%.*]], [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK-NEXT:    store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X2X4_T]] [[TMP19]]
+//
 float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
   return vld4q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
-// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK-NEXT:    store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP8]], i8* align 16 [[TMP9]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY8X16X4_T]] [[TMP10]]
+//
 poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
   return vld4q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
-// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK-NEXT:    store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY16X8X4_T]] [[TMP19]]
+//
 poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
   return vld4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
-// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
-// CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK-NEXT:    store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]], align 16
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP17]], i8* align 16 [[TMP18]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK-NEXT:    ret [[STRUCT_POLY64X2X4_T]] [[TMP19]]
+//
 poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
   return vld4q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT8X8X4_T]] [[TMP10]]
+//
 uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
   return vld4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT16X4X4_T]] [[TMP19]]
+//
 uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
   return vld4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE:%.*]], [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT32X2X4_T]] [[TMP19]]
+//
 uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
   return vld4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_UINT64X1X4_T]] [[TMP19]]
+//
 uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
   return vld4_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int8x8x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT8X8X4_T]] [[TMP10]]
+//
 int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
   return vld4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int16x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT16X4X4_T]] [[TMP19]]
+//
 int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
   return vld4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
-// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int32x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE:%.*]], [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK-NEXT:    store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT32X2X4_T]] [[TMP19]]
+//
 int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
   return vld4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.int64x1x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_INT64X1X4_T]] [[TMP19]]
+//
 int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
   return vld4_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0i8(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }*
-// CHECK:   store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float16x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x half>] [[B_COERCE:%.*]], [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0i8(<4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x half>, <4 x half>, <4 x half>, <4 x half> }*
+// CHECK-NEXT:    store { <4 x half>, <4 x half>, <4 x half>, <4 x half> } [[VLD4_LANE]], { <4 x half>, <4 x half>, <4 x half>, <4 x half> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT16X4X4_T]] [[TMP19]]
+//
 float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
-// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float32x2x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x float>] [[B_COERCE:%.*]], [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK-NEXT:    store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT32X2X4_T]] [[TMP19]]
+//
 float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
-// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.float64x1x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE:%.*]], [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK-NEXT:    store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_FLOAT64X1X4_T]] [[TMP19]]
+//
 float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
   return vld4_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
-// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
-// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
-// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
-// CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
+// CHECK-LABEL: @test_vld4_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK-NEXT:    store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP8]], i8* align 8 [[TMP9]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP10:%.*]] = load [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY8X8X4_T]] [[TMP10]]
+//
 poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
   return vld4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
-// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK-NEXT:    store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY16X4X4_T]] [[TMP19]]
+//
 poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
   return vld4_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
-// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
-// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
-// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
-// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
-// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
-// CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
-// CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
+// CHECK-LABEL: @test_vld4_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK-NEXT:    [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK-NEXT:    store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK-NEXT:    [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP17]], i8* align 8 [[TMP18]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP19:%.*]] = load [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK-NEXT:    ret [[STRUCT_POLY64X1X4_T]] [[TMP19]]
+//
 poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
   return vld4_lane_p64(a, b, 0);
 }
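
For context on the vld4_lane checks regenerated above: vld4_lane_<type> loads four consecutive elements from memory, de-interleaved into lane n of each of the four vectors in the structure, leaving every other lane as it was in the source operand — hence the four per-member loads, the @llvm.aarch64.neon.ld4lane call, and the 32-byte memcpy of the aggregate in the IR. A minimal usage sketch, with an illustrative function name that is not part of this patch:

    #include <arm_neon.h>

    // Reload lane 3 of each of the four 16-bit vectors in v from p,
    // which points at four consecutive interleaved uint16_t values.
    uint16x4x4_t refresh_lane3(const uint16_t *p, uint16x4x4_t v) {
      return vld4_lane_u16(p, v, 3); // lane index must be a constant expression
    }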
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[B:%.*]], i32 15
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_u8(uint8_t  *a, uint8x16_t b) {
   vst1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_u16(uint16_t  *a, uint16x8_t b) {
   vst1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_u32(uint32_t  *a, uint32x4_t b) {
   vst1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_u64(uint64_t  *a, uint64x2_t b) {
   vst1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[B:%.*]], i32 15
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_s8(int8_t  *a, int8x16_t b) {
   vst1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_s16(int16_t  *a, int16x8_t b) {
   vst1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_s32(int32_t  *a, int32x4_t b) {
   vst1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK:   [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   store half [[TMP3]], half* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    store half [[TMP3]], half* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_f16(float16_t  *a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   store float [[TMP3]], float* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_f32(float32_t  *a, float32x4_t b) {
   vst1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   store double [[TMP3]], double* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    store double [[TMP3]], double* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_f64(float64_t  *a, float64x2_t b) {
   vst1q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <16 x i8> [[B:%.*]], i32 15
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_p8(poly8_t  *a, poly8x16_t b) {
   vst1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_p16(poly16_t  *a, poly16x8_t b) {
   vst1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[B:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1q_lane_p64(poly64_t  *a, poly64x2_t b) {
   vst1q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[B:%.*]], i32 7
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_u8(uint8_t  *a, uint8x8_t b) {
   vst1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_u16(uint16_t  *a, uint16x4_t b) {
   vst1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_u32(uint32_t  *a, uint32x2_t b) {
   vst1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_u64(uint64_t  *a, uint64x1_t b) {
   vst1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[B:%.*]], i32 7
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_s8(int8_t  *a, int8x8_t b) {
   vst1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_s16(int16_t  *a, int16x4_t b) {
   vst1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
-// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK-NEXT:    store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_s32(int32_t  *a, int32x2_t b) {
   vst1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f16(half* %a, <4 x half> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
-// CHECK:   store half [[TMP3]], half* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK-NEXT:    store half [[TMP3]], half* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_f16(float16_t  *a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f32(float* %a, <2 x float> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
-// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
-// CHECK:   store float [[TMP3]], float* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK-NEXT:    store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_f32(float32_t  *a, float32x2_t b) {
   vst1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_f64(double* %a, <1 x double> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK:   [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
-// CHECK:   store double [[TMP3]], double* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK-NEXT:    store double [[TMP3]], double* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_f64(float64_t  *a, float64x1_t b) {
   vst1_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
-// CHECK:   store i8 [[TMP0]], i8* %a
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = extractelement <8 x i8> [[B:%.*]], i32 7
+// CHECK-NEXT:    store i8 [[TMP0]], i8* [[A:%.*]], align 1
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_p8(poly8_t  *a, poly8x8_t b) {
   vst1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK-NEXT:    store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_p16(poly16_t  *a, poly16x4_t b) {
   vst1_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
-// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
-// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst1_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK-NEXT:    [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK-NEXT:    store i64 [[TMP3]], i64* [[TMP4]], align 8
+// CHECK-NEXT:    ret void
+//
 void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
   vst1_lane_p64(a, b, 0);
 }
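
The vst1q_lane/vst1_lane checks above cover the single-lane stores: vst1[q]_lane_<type> extracts lane n of a 128-bit (q) or 64-bit vector and writes that one element to memory, so the regenerated IR is just an extractelement followed by a scalar store — now checked together with the element type's natural alignment. A small sketch, assuming an illustrative function name:

    #include <arm_neon.h>

    // Store only the upper element of a 2 x double vector; on AArch64 this
    // can lower to a single lane store rather than extract plus scalar store.
    void store_high_half(float64_t *dst, float64x2_t v) {
      vst1q_lane_f64(dst, v, 1);
    }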
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
   vst2q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X2_T]], %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
   vst2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE:%.*]], [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X2_T]], %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
   vst2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X2_T]], %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
   vst2q_lane_u64(a, b, 1);
 }
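
The vst2q_lane tests in this stretch exercise the interleaving lane stores: vst2q_lane_<type> writes lane n of each of the two vectors in the structure to adjacent memory locations through @llvm.aarch64.neon.st2lane, which is why each check sequence loads both struct members before the intrinsic call. A brief sketch, with illustrative names:

    #include <arm_neon.h>

    // Write lane 7 of v.val[0] and lane 7 of v.val[1] to dst[0] and dst[1].
    void store_pair_lane7(uint16_t *dst, uint16x8x2_t v) {
      vst2q_lane_u16(dst, v, 7);
    }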
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
   vst2q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X2_T]], %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
   vst2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i32>] [[B_COERCE:%.*]], [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X2_T]], %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
   vst2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X2_T]], %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
   vst2q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8f16.p0i8(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x half>] [[B_COERCE:%.*]], [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X2_T]], %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8f16.p0i8(<8 x half> [[TMP7]], <8 x half> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x float>] [[B_COERCE:%.*]], [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X2_T]], %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x double>] [[B_COERCE:%.*]], [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X2_T]], %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
   vst2q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <16 x i8>] [[B_COERCE:%.*]], [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
   vst2q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i16>] [[B_COERCE:%.*]], [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X2_T]], %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
   vst2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X2_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X2_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i64>] [[B_COERCE:%.*]], [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X2_T]], %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
   vst2q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X2_T]], %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
   vst2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X2_T]], %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
   vst2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE:%.*]], [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X2_T]], %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
   vst2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X2_T]], %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
   vst2_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X2_T]], %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
   vst2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X2_T]], %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
   vst2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x i32>] [[B_COERCE:%.*]], [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X2_T]], %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
   vst2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X2_T]], %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
   vst2_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4f16.p0i8(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x half>] [[B_COERCE:%.*]], [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X2_T]], %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4f16.p0i8(<4 x half> [[TMP7]], <4 x half> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
   vst2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <2 x float>] [[B_COERCE:%.*]], [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X2_T]], %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
   vst2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x double>] [[B_COERCE:%.*]], [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X2_T]], %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
   vst2_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <8 x i8>] [[B_COERCE:%.*]], [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X2_T]], %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
   vst2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <4 x i16>] [[B_COERCE:%.*]], [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X2_T]], %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
   vst2_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
-// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst2_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X2_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X2_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [2 x <1 x i64>] [[B_COERCE:%.*]], [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X2_T]], %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
   vst2_lane_p64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
   vst3q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X3_T]], %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
   vst3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE:%.*]], [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X3_T]], %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
   vst3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X3_T]], %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
   vst3q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) {
   vst3q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X3_T]], %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) {
   vst3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i32>] [[B_COERCE:%.*]], [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X3_T]], %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) {
   vst3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X3_T]], %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
   vst3q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8f16.p0i8(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x half>] [[B_COERCE:%.*]], [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X3_T]], %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8f16.p0i8(<8 x half> [[TMP9]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x float>] [[B_COERCE:%.*]], [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X3_T]], %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) {
   vst3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x double>] [[B_COERCE:%.*]], [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X3_T]], %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) {
   vst3q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <16 x i8>] [[B_COERCE:%.*]], [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
   vst3q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i16>] [[B_COERCE:%.*]], [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X3_T]], %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
   vst3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X3_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X3_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i64>] [[B_COERCE:%.*]], [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 48, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X3_T]], %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
   vst3q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X3_T]], %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
   vst3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X3_T]], %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
   vst3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE:%.*]], [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X3_T]], %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
   vst3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X3_T]], %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
   vst3_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X3_T]], %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
   vst3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X3_T]], %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
   vst3_lane_s16(a, b, 3);
 }
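 
 // For element types wider than i8 (s16 here and the variants that follow),
 // the checks additionally show each vector being bitcast to <8 x i8> and
 // back before the intrinsic call, and the destination pointer being
 // bitcast to i8* first.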
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x i32>] [[B_COERCE:%.*]], [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X3_T]], %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
   vst3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X3_T]], %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
   vst3_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4f16.p0i8(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x half>] [[B_COERCE:%.*]], [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X3_T]], %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4f16.p0i8(<4 x half> [[TMP9]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <2 x float>] [[B_COERCE:%.*]], [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X3_T]], %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
   vst3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x double>] [[B_COERCE:%.*]], [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X3_T]], %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
   vst3_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <8 x i8>] [[B_COERCE:%.*]], [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X3_T]], %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
   vst3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <4 x i16>] [[B_COERCE:%.*]], [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X3_T]], %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
   vst3_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
-// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst3_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X3_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X3_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [3 x <1 x i64>] [[B_COERCE:%.*]], [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 24, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X3_T]], %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
   vst3_lane_p64(a, b, 0);
 }
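 
 // The vst4q_lane variants below follow the same pattern with four vectors:
 // the aggregate is 64 bytes, so the allocas and the llvm.memcpy use
 // align 16, and the call targets the @llvm.aarch64.neon.st4lane
 // intrinsics instead.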
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
   vst4q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X8X4_T]], %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
   vst4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE:%.*]], [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X4X4_T]], %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
   vst4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X2X4_T]], %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
   vst4q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
   vst4q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X8X4_T]], %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
   vst4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i32>] [[B_COERCE:%.*]], [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X4X4_T]], %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
   vst4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X2X4_T]], %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
   vst4q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8f16.p0i8(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x half>] [[B_COERCE:%.*]], [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X8X4_T]], %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x half>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8f16.p0i8(<8 x half> [[TMP11]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
   vst4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X4X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x float>] [[B_COERCE:%.*]], [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X4X4_T]], %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
   vst4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x double>] [[B_COERCE:%.*]], [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X2X4_T]], %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
   vst4q_lane_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X16X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X16X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <16 x i8>] [[B_COERCE:%.*]], [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
   vst4q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X8X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X8X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i16>] [[B_COERCE:%.*]], [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X8X4_T]], %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
   vst4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4q_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X2X4_T:%.*]], align 16
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X2X4_T]], align 16
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i64>] [[B_COERCE:%.*]], [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP0]], i8* align 16 [[TMP1]], i64 64, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X2X4_T]], %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
   vst4q_lane_p64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT8X8X4_T]], %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
   vst4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT16X4X4_T]], %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
   vst4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE:%.*]], [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT32X2X4_T]], %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_UINT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_UINT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_UINT64X1X4_T]], %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
   vst4_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT8X8X4_T]], %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT16X4X4_T]], %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x i32>] [[B_COERCE:%.*]], [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT32X2X4_T]], %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_INT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_INT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_INT64X1X4_T]], %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
   vst4_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4f16.p0i8(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_f16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x half>] [[B_COERCE:%.*]], [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast half* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT16X4X4_T]], %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x half>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4f16.p0i8(<4 x half> [[TMP11]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT32X2X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <2 x float>] [[B_COERCE:%.*]], [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast float* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT32X2X4_T]], %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
   vst4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_FLOAT64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x double>] [[B_COERCE:%.*]], [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_FLOAT64X1X4_T]], %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
   vst4_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY8X8X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY8X8X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <8 x i8>] [[B_COERCE:%.*]], [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY8X8X4_T]], %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* [[A:%.*]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
   vst4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY16X4X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY16X4X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <4 x i16>] [[B_COERCE:%.*]], [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY16X4X4_T]], %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
   vst4_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #2 {
-// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
-// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
-// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
-// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
-// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
-// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
-// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
-// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
-// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
-// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
-// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
-// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
-// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
-// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
-// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
-// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
-// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
-// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
-// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
-// CHECK:   ret void
+// CHECK-LABEL: @test_vst4_lane_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[B:%.*]] = alloca [[STRUCT_POLY64X1X4_T:%.*]], align 8
+// CHECK-NEXT:    [[__S1:%.*]] = alloca [[STRUCT_POLY64X1X4_T]], align 8
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK-NEXT:    store [4 x <1 x i64>] [[B_COERCE:%.*]], [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP0]], i8* align 8 [[TMP1]], i64 32, i1 false)
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A:%.*]] to i8*
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK-NEXT:    [[VAL1:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK-NEXT:    [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK-NEXT:    [[VAL3:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK-NEXT:    [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK-NEXT:    [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK-NEXT:    [[VAL5:%.*]] = getelementptr inbounds [[STRUCT_POLY64X1X4_T]], %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK-NEXT:    [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK-NEXT:    [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK-NEXT:    [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK-NEXT:    call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
 void test_vst4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
   vst4_lane_p64(a, b, 0);
 }
 
-// CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="128"
-// CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="64"
-// CHECK: attributes #2 ={{.*}}"min-legal-vector-width"="0"

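One practical upshot visible in this file: the trailing `// CHECK: attributes #N ...` assertions (removed just above) tied every test to a specific attribute-group number, and the regenerated per-function checks drop that dependency. For orientation, here is a minimal source-level sketch of the st4-lane pattern these tests exercise; assume an AArch64 NEON target, and note that `store_lane1` is an illustrative name, not part of the test:

  #include <arm_neon.h>

  /* vst4_lane_f32 stores lane 1 of each of the four <2 x float> values,
     interleaved, to dst[0..3]; this is the pattern behind
     test_vst4_lane_f32 above. */
  void store_lane1(float32_t *dst, float32x2x4_t quad) {
    vst4_lane_f32(dst, quad, 1);
  }
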
diff --git a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
index 367978d84af4..5c9a5c13dc2b 100644
--- a/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
+++ b/clang/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
 // RUN: -disable-O0-optnone -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
@@ -6,414 +7,496 @@
 #include <arm_neon.h>
 
 
-// CHECK-LABEL: define{{.*}} float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
-// CHECK:   ret float [[MUL]]
+// CHECK-LABEL: @test_vmuls_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x float> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[A:%.*]], [[VGET_LANE]]
+// CHECK-NEXT:    ret float [[MUL]]
+//
 float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
   return vmuls_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
-// CHECK:   ret double [[MUL]]
+// CHECK-LABEL: @test_vmuld_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[B:%.*]], i32 0
+// CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], [[VGET_LANE]]
+// CHECK-NEXT:    ret double [[MUL]]
+//
 float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
   return vmuld_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
-// CHECK:   ret float [[MUL]]
+// CHECK-LABEL: @test_vmuls_laneq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[MUL:%.*]] = fmul float [[A:%.*]], [[VGETQ_LANE]]
+// CHECK-NEXT:    ret float [[MUL]]
+//
 float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
   return vmuls_laneq_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
-// CHECK:   ret double [[MUL]]
+// CHECK-LABEL: @test_vmuld_laneq_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[MUL:%.*]] = fmul double [[A:%.*]], [[VGETQ_LANE]]
+// CHECK-NEXT:    ret double [[MUL]]
+//
 float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
   return vmuld_laneq_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
-// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %a to double
-// CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
-// CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
-// CHECK:   ret <1 x double> [[TMP4]]
+// CHECK-LABEL: @test_vmul_n_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to double
+// CHECK-NEXT:    [[TMP1:%.*]] = fmul double [[TMP0]], [[B:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[TMP1]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP2]]
+//
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
   return vmul_n_f64(a, b);
 }
 
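The scalar-by-lane multiplies above lower to a plain extractelement plus fmul, so the intrinsic is exactly equivalent to extracting the lane by hand. A sketch of that equivalence (`mul_by_lane1` is a made-up helper name):

  #include <arm_neon.h>

  /* Same result as vmuls_lane_f32(a, b, 1), per the IR checked above. */
  float32_t mul_by_lane1(float32_t a, float32x2_t b) {
    return a * vget_lane_f32(b, 1);
  }
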
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> %b, i32 1
-// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]])
-// CHECK:   ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: @test_vmulxs_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x float> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A:%.*]], float [[VGET_LANE]]) [[ATTR5:#.*]]
+// CHECK-NEXT:    ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
   return vmulxs_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> %b, i32 3
-// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]])
-// CHECK:   ret float [[VMULXS_F32_I]]
+// CHECK-LABEL: @test_vmulxs_laneq_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float [[A:%.*]], float [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret float [[VMULXS_F32_I]]
+//
 float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
   return vmulxs_laneq_f32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]])
-// CHECK:   ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: @test_vmulxd_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A:%.*]], double [[VGET_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
   return vmulxd_lane_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]])
-// CHECK:   ret double [[VMULXD_F64_I]]
+// CHECK-LABEL: @test_vmulxd_laneq_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[A:%.*]], double [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret double [[VMULXD_F64_I]]
+//
 float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
   return vmulxd_laneq_f64(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> %b, i32 0
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]])
-// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK:   ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: @test_vmulx_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[A:%.*]], i32 0
+// CHECK-NEXT:    [[VGET_LANE3:%.*]] = extractelement <1 x double> [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE3]]) [[ATTR5]]
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
   return vmulx_lane_f64(a, b, 0);
 }
 
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #1 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 0
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK:   ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: @test_vmulx_laneq_f64_0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[A:%.*]], i32 0
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #1 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> %a, i32 0
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> %b, i32 1
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> %a, double [[VMULXD_F64_I]], i32 0
-// CHECK:   ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: @test_vmulx_laneq_f64_1(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[A:%.*]], i32 0
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[A]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
 }
 
 
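Unlike the plain multiplies, every fmulx test keeps a call to @llvm.aarch64.neon.fmulx: FMULX follows ordinary multiplication except that 0 * Inf yields +/-2.0 rather than NaN, so it cannot be folded to fmul. The lane variants are just the scalar operation on an extracted lane; an illustrative sketch (`mulx_by_lane1` is hypothetical):

  #include <arm_neon.h>

  /* Matches vmulxs_lane_f32(a, b, 1): extract the lane, then FMULX. */
  float32_t mulx_by_lane1(float32_t a, float32x2_t b) {
    return vmulxs_f32(a, vget_lane_f32(b, 1));
  }
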
-// CHECK-LABEL: define{{.*}} float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
-// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
-// CHECK:   ret float [[TMP2]]
+// CHECK-LABEL: @test_vfmas_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[B:%.*]], float [[EXTRACT]], float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
-// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> %c, i32 0
-// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK:   ret double [[TMP2]]
+// CHECK-LABEL: @test_vfmad_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <1 x double> [[C:%.*]], i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B:%.*]], double [[EXTRACT]], double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
 }
 
-// CHECK-LABEL: define{{.*}} double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #1 {
-// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> %c, i32 1
-// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
-// CHECK:   ret double [[TMP2]]
+// CHECK-LABEL: @test_vfmad_laneq_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = call double @llvm.fma.f64(double [[B:%.*]], double [[EXTRACT]], double [[A:%.*]])
+// CHECK-NEXT:    ret double [[TMP0]]
+//
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
-// CHECK:   [[SUB:%.*]] = fneg float %b
-// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> %c, i32 1
-// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
-// CHECK:   ret float [[TMP2]]
+// CHECK-LABEL: @test_vfmss_lane_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg float [[B:%.*]]
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x float> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[TMP0:%.*]] = call float @llvm.fma.f32(float [[FNEG]], float [[EXTRACT]], float [[A:%.*]])
+// CHECK-NEXT:    ret float [[TMP0]]
+//
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK:   ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: @test_vfma_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
-// CHECK:   [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
-// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
-// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
-// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
-// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
-// CHECK:   ret <1 x double> [[FMLA2]]
+// CHECK-LABEL: @test_vfms_lane_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK-NEXT:    [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK-NEXT:    [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK-NEXT:    [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK-NEXT:    [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK-NEXT:    ret <1 x double> [[FMLA2]]
+//
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK:   ret <1 x double> [[TMP7]]
+// CHECK-LABEL: @test_vfma_laneq_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[B:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP7]]
+//
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #1 {
-// CHECK:   [[SUB:%.*]] = fneg <1 x double> %b
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
-// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
-// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
-// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
-// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
-// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
-// CHECK:   ret <1 x double> [[TMP7]]
+// CHECK-LABEL: @test_vfms_laneq_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[FNEG:%.*]] = fneg <1 x double> [[B:%.*]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x double> [[A:%.*]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x double> [[FNEG]] to <8 x i8>
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8>
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK-NEXT:    [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK-NEXT:    [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK-NEXT:    [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK-NEXT:    ret <1 x double> [[TMP7]]
+//
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
 }
 
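All the fused multiply-add lane forms above funnel into a single @llvm.fma call once the lane is extracted, and the vfms variants differ only by the fneg on the multiplicand. A sketch of the scalar case in standard C (`fma_lane1` is a made-up name):

  #include <arm_neon.h>
  #include <math.h>

  /* vfmas_lane_f32(a, b, c, 1) computes fma(b, c[1], a), matching the
     extractelement + llvm.fma.f32 sequence checked above. */
  float32_t fma_lane1(float32_t a, float32_t b, float32x2_t c) {
    return fmaf(b, vget_lane_f32(c, 1), a);
  }
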
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK:   ret i32 [[TMP4]]
+// CHECK-LABEL: @test_vqdmullh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT:    [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmullh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]])
-// CHECK:   ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: @test_vqdmulls_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A:%.*]], i32 [[VGET_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulls_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
-// CHECK:   ret i32 [[TMP4]]
+// CHECK-LABEL: @test_vqdmullh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT:    [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK-NEXT:    ret i32 [[TMP2]]
+//
 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmullh_laneq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK:   ret i64 [[VQDMULLS_S32_I]]
+// CHECK-LABEL: @test_vqdmulls_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[A:%.*]], i32 [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i64 [[VQDMULLS_S32_I]]
+//
 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulls_laneq_s32(a, b, 3);
 }
 
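Worth noting in the sqdmull checks above: there is no scalar i16 form of the intrinsic in IR, so the operands are placed in lane 0 of a <4 x i16>, the vector @llvm.aarch64.neon.sqdmull.v4i32 is called, and lane 0 of the result is extracted. At source level the lane form is equivalent to this sketch (`sqdmull_lane3` is an illustrative name):

  #include <arm_neon.h>

  /* vqdmullh_lane_s16(a, b, 3): widen, double and saturate a * b[3],
     i.e. sat32(2 * a * b[3]). */
  int32_t sqdmull_lane3(int16_t a, int16x4_t b) {
    return vqdmullh_s16(a, vget_lane_s16(b, 3));
  }
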
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK:   ret i16 [[TMP4]]
+// CHECK-LABEL: @test_vqdmulhh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT:    [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmulhh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK:   ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: @test_vqdmulhs_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A:%.*]], i32 [[VGET_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulhs_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
-// CHECK:   ret i16 [[TMP4]]
+// CHECK-LABEL: @test_vqdmulhh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT:    [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
 int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmulhh_laneq_s16(a, b, 7);
 }
 
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK:   ret i32 [[VQDMULHS_S32_I]]
+// CHECK-LABEL: @test_vqdmulhs_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 [[A:%.*]], i32 [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i32 [[VQDMULHS_S32_I]]
+//
 int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulhs_laneq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> %b, i32 3
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
-// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK:   ret i16 [[TMP4]]
+// CHECK-LABEL: @test_vqrdmulhh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <4 x i16> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqrdmulhh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> %b, i32 1
-// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]])
-// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: @test_vqrdmulhs_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <2 x i32> [[B:%.*]], i32 1
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A:%.*]], i32 [[VGET_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqrdmulhs_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: define{{.*}} i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> %b, i32 7
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
-// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
-// CHECK:   ret i16 [[TMP4]]
+// CHECK-LABEL: @test_vqrdmulhh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK-NEXT:    [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) [[ATTR5]]
+// CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK-NEXT:    ret i16 [[TMP2]]
+//
 int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqrdmulhh_laneq_s16(a, b, 7);
 }
 
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #1 {
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> %b, i32 3
-// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]])
-// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
+// CHECK-LABEL: @test_vqrdmulhs_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 3
+// CHECK-NEXT:    [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 [[A:%.*]], i32 [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    ret i32 [[VQRDMULHS_S32_I]]
+//
 int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqrdmulhs_laneq_s32(a, b, 3);
 }
 
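For the doubling-multiply-high family above, the rounding variant differs from the non-rounding one only by the constant added before the high half is taken. A sketch of the arithmetic, assuming the usual ACLE definitions (`dmulh_lane3` is a made-up name):

  #include <arm_neon.h>

  /* vqdmulhh_lane_s16(a, b, 3)  ~ sat16((2 * a * b[3]) >> 16)
     vqrdmulhh_lane_s16(a, b, 3) ~ sat16((2 * a * b[3] + (1 << 15)) >> 16) */
  int16_t dmulh_lane3(int16_t a, int16x4_t b) {
    return vqdmulhh_s16(a, vget_lane_s16(b, 3));
  }
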
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
-// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
-// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK:   ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlalh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT:    [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[LANE0]])
+// CHECK-NEXT:    ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlalh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
-// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK:   ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlals_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B:%.*]], i32 [[LANE]])
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A:%.*]], i64 [[VQDMLXL]])
+// CHECK-NEXT:    ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlals_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
-// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
-// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
-// CHECK:   ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlalh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT:    [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 [[A:%.*]], i32 [[LANE0]])
+// CHECK-NEXT:    ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlalh_laneq_s16(a, b, c, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
-// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK:   ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlals_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B:%.*]], i32 [[LANE]])
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 [[A:%.*]], i64 [[VQDMLXL]])
+// CHECK-NEXT:    ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlals_laneq_s32(a, b, c, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
-// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> %c, i32 3
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
-// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK:   ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlslh_lane_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <4 x i16> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT:    [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[LANE0]])
+// CHECK-NEXT:    ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlslh_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
-// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> %c, i32 1
-// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK:   ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlsls_lane_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B:%.*]], i32 [[LANE]])
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A:%.*]], i64 [[VQDMLXL]])
+// CHECK-NEXT:    ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlsls_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: define{{.*}} i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #1 {
-// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> %c, i32 7
-// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
-// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
-// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
-// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
-// CHECK:   ret i32 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlslh_laneq_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <8 x i16> [[C:%.*]], i32 7
+// CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
+// CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]])
+// CHECK-NEXT:    [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 [[A:%.*]], i32 [[LANE0]])
+// CHECK-NEXT:    ret i32 [[VQDMLXL1]]
+//
 int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlslh_laneq_s16(a, b, c, 7);
 }
 
-// CHECK-LABEL: define{{.*}} i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #1 {
-// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> %c, i32 3
-// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
-// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
-// CHECK:   ret i64 [[VQDMLXL1]]
+// CHECK-LABEL: @test_vqdmlsls_laneq_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
+// CHECK-NEXT:    [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 [[B:%.*]], i32 [[LANE]])
+// CHECK-NEXT:    [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 [[A:%.*]], i64 [[VQDMLXL]])
+// CHECK-NEXT:    ret i64 [[VQDMLXL1]]
+//
 int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlsls_laneq_s32(a, b, c, 3);
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_lane_f64_0() #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]])
-// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK:   ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: @test_vmulx_lane_f64_0(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VGET_LANE8:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE8]]) [[ATTR5]]
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_lane_f64_0() {
       float64x1_t arg1;
       float64x1_t arg2;
@@ -425,15 +508,17 @@ float64x1_t test_vmulx_lane_f64_0() {
       return result;
 }
 
-// CHECK-LABEL: define{{.*}} <1 x double> @test_vmulx_laneq_f64_2() #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
-// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
-// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
-// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
-// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
-// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]])
-// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
-// CHECK:   ret <1 x double> [[VSET_LANE]]
+// CHECK-LABEL: @test_vmulx_laneq_f64_2(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK-NEXT:    [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP0]], i32 0
+// CHECK-NEXT:    [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[SHUFFLE_I]], i32 1
+// CHECK-NEXT:    [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) [[ATTR5]]
+// CHECK-NEXT:    [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP0]], double [[VMULXD_F64_I]], i32 0
+// CHECK-NEXT:    ret <1 x double> [[VSET_LANE]]
+//
 float64x1_t test_vmulx_laneq_f64_2() {
       float64x1_t arg1;
       float64x1_t arg2;

diff  --git a/clang/test/CodeGen/aarch64-poly128.c b/clang/test/CodeGen/aarch64-poly128.c
index 113160f9d64b..f55e36cf2f2f 100644
--- a/clang/test/CodeGen/aarch64-poly128.c
+++ b/clang/test/CodeGen/aarch64-poly128.c
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
 // RUN: -disable-O0-optnone -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
@@ -12,238 +13,300 @@
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: define{{.*}} void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   store i128 %val, i128* [[TMP1]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_vstrq_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    store i128 [[VAL:%.*]], i128* [[TMP1]], align 16
+// CHECK-NEXT:    ret void
+//
 void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
   vstrq_p128(ptr, val);
 
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vldrq_p128(i128* %ptr) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
-// CHECK:   ret i128 [[TMP2]]
+// CHECK-LABEL: @test_vldrq_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i128, i128* [[TMP1]], align 16
+// CHECK-NEXT:    ret i128 [[TMP2]]
+//
 poly128_t test_vldrq_p128(poly128_t * ptr) {
   return vldrq_p128(ptr);
 
 }
 
-// CHECK-LABEL: define{{.*}} void @test_ld_st_p128(i128* %ptr) #0 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
-// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
-// CHECK:   [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1
-// CHECK:   [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
-// CHECK:   store i128 [[TMP2]], i128* [[TMP4]]
-// CHECK:   ret void
+// CHECK-LABEL: @test_ld_st_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128* [[PTR:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK-NEXT:    [[TMP2:%.*]] = load i128, i128* [[TMP1]], align 16
+// CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 1
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
+// CHECK-NEXT:    store i128 [[TMP2]], i128* [[TMP4]], align 16
+// CHECK-NEXT:    ret void
+//
 void test_ld_st_p128(poly128_t * ptr) {
    vstrq_p128(ptr+1, vldrq_p128(ptr));
 
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vmull_p64(i64 %a, i64 %b) #0 {
-// CHECK:   [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #3
-// CHECK:   [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
-// CHECK:   ret i128 [[VMULL_P641_I]]
+// CHECK-LABEL: @test_vmull_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A:%.*]], i64 [[B:%.*]]) [[ATTR3:#.*]]
+// CHECK-NEXT:    [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
+// CHECK-NEXT:    ret i128 [[VMULL_P641_I]]
+//
 poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
   return vmull_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #1 {
-// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
-// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
-// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1>
-// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
-// CHECK:   [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #3
-// CHECK:   [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
-// CHECK:   ret i128 [[VMULL_P641_I_I]]
+// CHECK-LABEL: @test_vmull_high_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> [[A:%.*]], <2 x i64> [[A]], <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
+// CHECK-NEXT:    [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> [[B]], <1 x i32> <i32 1>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
+// CHECK-NEXT:    [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) [[ATTR3]]
+// CHECK-NEXT:    [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
+// CHECK-NEXT:    ret i128 [[VMULL_P641_I_I]]
+//
 poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
   return vmull_high_p64(a, b);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
   return vreinterpretq_p128_s8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
   return vreinterpretq_p128_s16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
   return vreinterpretq_p128_s32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_s64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
   return vreinterpretq_p128_s64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
   return vreinterpretq_p128_u8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
   return vreinterpretq_p128_u16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
   return vreinterpretq_p128_u32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_u64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
   return vreinterpretq_p128_u64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
   return vreinterpretq_p128_f32(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_f64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
   return vreinterpretq_p128_f64(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_p8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
   return vreinterpretq_p128_p8(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_p16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i16> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
   return vreinterpretq_p128_p16(a);
 }
 
-// CHECK-LABEL: define{{.*}} i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
-// CHECK:   ret i128 [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p128_p64(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
 poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
   return vreinterpretq_p128_p64(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_s8_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
   return vreinterpretq_s8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_s16_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 int16x8_t test_vreinterpretq_s16_p128(poly128_t  a) {
   return vreinterpretq_s16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
-// CHECK:   ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_s32_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
 int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
   return vreinterpretq_s32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_s64_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 int64x2_t test_vreinterpretq_s64_p128(poly128_t  a) {
   return vreinterpretq_s64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_u8_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 uint8x16_t test_vreinterpretq_u8_p128(poly128_t  a) {
   return vreinterpretq_u8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_u16_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 uint16x8_t test_vreinterpretq_u16_p128(poly128_t  a) {
   return vreinterpretq_u16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
-// CHECK:   ret <4 x i32> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_u32_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
 uint32x4_t test_vreinterpretq_u32_p128(poly128_t  a) {
   return vreinterpretq_u32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_u64_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 uint64x2_t test_vreinterpretq_u64_p128(poly128_t  a) {
   return vreinterpretq_u64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x float>
-// CHECK:   ret <4 x float> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_f32_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
 float32x4_t test_vreinterpretq_f32_p128(poly128_t  a) {
   return vreinterpretq_f32_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x double>
-// CHECK:   ret <2 x double> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_f64_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <2 x double>
+// CHECK-NEXT:    ret <2 x double> [[TMP0]]
+//
 float64x2_t test_vreinterpretq_f64_p128(poly128_t  a) {
   return vreinterpretq_f64_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
-// CHECK:   ret <16 x i8> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p8_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
 poly8x16_t test_vreinterpretq_p8_p128(poly128_t  a) {
   return vreinterpretq_p8_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
-// CHECK:   ret <8 x i16> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p16_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <8 x i16>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
 poly16x8_t test_vreinterpretq_p16_p128(poly128_t  a) {
   return vreinterpretq_p16_p128(a);
 }
 
-// CHECK-LABEL: define{{.*}} <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #1 {
-// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
-// CHECK:   ret <2 x i64> [[TMP0]]
+// CHECK-LABEL: @test_vreinterpretq_p64_p128(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[A:%.*]] to <2 x i64>
+// CHECK-NEXT:    ret <2 x i64> [[TMP0]]
+//
 poly64x2_t test_vreinterpretq_p64_p128(poly128_t  a) {
   return vreinterpretq_p64_p128(a);
 }

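A note on the regenerated poly128 checks above: update_cc_test_checks.py captures attribute-group references into FileCheck variables, so [[ATTR3:#.*]] binds whatever #N token appears at its first match and every later [[ATTR3]] use requires the identical token. A minimal sketch of that idiom, assuming a hypothetical test file (the RUN line and functions are illustrative only, not part of this patch):

  // RUN: %clang_cc1 -triple arm64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s

  int first(int x)  { return x; }
  int second(int x) { return x; }

  // The first pattern defines ATTR0 from the "#N" token it matches; the
  // second reuses it, asserting both definitions reference the same
  // attribute group.
  // CHECK: define{{.*}} i32 @first({{.*}}) [[ATTR0:#[0-9]+]] {
  // CHECK: define{{.*}} i32 @second({{.*}}) [[ATTR0]] {
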
diff  --git a/clang/test/CodeGen/aarch64-poly64.c b/clang/test/CodeGen/aarch64-poly64.c
index ebc58b5840fa..1452e8aff6c2 100644
--- a/clang/test/CodeGen/aarch64-poly64.c
+++ b/clang/test/CodeGen/aarch64-poly64.c
@@ -600,4 +600,4 @@ poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
 
 // CHECK: attributes #0 ={{.*}}"min-legal-vector-width"="64"
 // CHECK: attributes #1 ={{.*}}"min-legal-vector-width"="128"
-// CHECK: attributes #2 ={{.*}}"min-legal-vector-width"="0"
+// CHECK-NOT: attributes #2 ={{.*}}"min-legal-vector-width"="0"
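
The poly64 hunk above captures the patch's observable effect: when a function's largest vector width is zero, the min-legal-vector-width attribute is now omitted entirely instead of being emitted as "0", so the test asserts absence rather than matching a value. A minimal sketch of the same assertion on a scalar-only function (hypothetical file, not from this patch):

  // RUN: %clang_cc1 -triple arm64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s

  // No vector arguments, returns, or vector-using callees, so the largest
  // vector width stays zero and no attribute should be attached.
  int add(int a, int b) { return a + b; }
  // CHECK-LABEL: define{{.*}} i32 @add(
  // CHECK-NOT: "min-legal-vector-width"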

diff  --git a/clang/test/CodeGenCXX/dllexport-ctor-closure.cpp b/clang/test/CodeGenCXX/dllexport-ctor-closure.cpp
index 42401e5ecd5d..882cc71e57f7 100644
--- a/clang/test/CodeGenCXX/dllexport-ctor-closure.cpp
+++ b/clang/test/CodeGenCXX/dllexport-ctor-closure.cpp
@@ -5,7 +5,7 @@
 
 struct CtorWithClosure {
   __declspec(dllexport) CtorWithClosure(...) {}
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FCtorWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FCtorWithClosure@@QAEXXZ"({{.*}}) comdat
 // CHECK:   %[[this_addr:.*]] = alloca %struct.CtorWithClosure*, align 4
 // CHECK:   store %struct.CtorWithClosure* %this, %struct.CtorWithClosure** %[[this_addr]], align 4
 // CHECK:   %[[this:.*]] = load %struct.CtorWithClosure*, %struct.CtorWithClosure** %[[this_addr]]
@@ -17,7 +17,7 @@ struct CtorWithClosureOutOfLine {
   __declspec(dllexport) CtorWithClosureOutOfLine(...);
 };
 CtorWithClosureOutOfLine::CtorWithClosureOutOfLine(...) {}
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FCtorWithClosureOutOfLine@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FCtorWithClosureOutOfLine@@QAEXXZ"({{.*}}) comdat
 
 #define DELETE_IMPLICIT_MEMBERS(ClassName) \
     ClassName(ClassName &&) = delete; \
@@ -28,7 +28,7 @@ CtorWithClosureOutOfLine::CtorWithClosureOutOfLine(...) {}
 struct __declspec(dllexport) ClassWithClosure {
   DELETE_IMPLICIT_MEMBERS(ClassWithClosure);
   ClassWithClosure(...) {}
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FClassWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FClassWithClosure@@QAEXXZ"({{.*}}) comdat
 // CHECK:   %[[this_addr:.*]] = alloca %struct.ClassWithClosure*, align 4
 // CHECK:   store %struct.ClassWithClosure* %this, %struct.ClassWithClosure** %[[this_addr]], align 4
 // CHECK:   %[[this:.*]] = load %struct.ClassWithClosure*, %struct.ClassWithClosure** %[[this_addr]]
@@ -44,10 +44,10 @@ template struct __declspec(dllexport) TemplateWithClosure<char>;
 extern template struct TemplateWithClosure<int>;
 template struct __declspec(dllexport) TemplateWithClosure<int>;
 
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$TemplateWithClosure@D@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$TemplateWithClosure@D@@QAEXXZ"({{.*}}) comdat
 // CHECK:   call {{.*}} @"??0?$TemplateWithClosure@D@@QAE@H@Z"({{.*}}, i32 1)
 
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$TemplateWithClosure@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$TemplateWithClosure@H@@QAEXXZ"({{.*}}) comdat
 // CHECK:   call {{.*}} @"??0?$TemplateWithClosure@H@@QAE@H@Z"({{.*}}, i32 4)
 
 template <typename T> struct __declspec(dllexport) ExportedTemplateWithClosure {
@@ -55,7 +55,7 @@ template <typename T> struct __declspec(dllexport) ExportedTemplateWithClosure {
 };
 template <> ExportedTemplateWithClosure<int>::ExportedTemplateWithClosure(int); // Don't try to emit the closure for a declaration.
 template <> ExportedTemplateWithClosure<int>::ExportedTemplateWithClosure(int) {};
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$ExportedTemplateWithClosure@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$ExportedTemplateWithClosure@H@@QAEXXZ"({{.*}}) comdat
 // CHECK:   call {{.*}} @"??0?$ExportedTemplateWithClosure@H@@QAE@H@Z"({{.*}}, i32 4)
 
 struct __declspec(dllexport) NestedOuter {
@@ -67,8 +67,8 @@ struct __declspec(dllexport) NestedOuter {
   };
 };
 
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FNestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
-// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FNestedOuter@@QAEXXZ"({{.*}}) comdat
+// CHECK-LABEL: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) comdat
 
 struct HasDtor {
   ~HasDtor();

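One hedged reading of the dllexport check updates, since the new IR is not shown in this diff: the dropped {{#[0-9]+}} token matched an attribute-group reference on the define line, and a function that ends up with no attributes carries no such token at all, so the loosened CHECK-LABEL lines tolerate either form. A sketch of the mechanics (hypothetical file, not from these tests):

  // RUN: %clang_cc1 -triple i686-pc-windows-msvc -emit-llvm -o - %s | FileCheck %s

  // LLVM IR prints function attributes as a numbered group, referenced as
  // "#N" on the define line, with "attributes #N = { ... }" emitted at the
  // end of the module; the regex below makes that reference optional.
  void g(void) {}
  // CHECK: define dso_local void @g(){{( #[0-9]+)?}}
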
diff  --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp
index a0f53c8e2e9f..6c84c21c5f35 100644
--- a/clang/test/CodeGenCXX/dllexport.cpp
+++ b/clang/test/CodeGenCXX/dllexport.cpp
@@ -535,7 +535,7 @@ struct SomeTemplate {
 // MSVC2013-DAG: define weak_odr dso_local dllexport {{.+}} @"??4?$SomeTemplate@H@@Q{{.+}}0@A{{.+}}0@@Z"
 struct __declspec(dllexport) InheritFromTemplate : SomeTemplate<int> {};
 
-// M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) comdat
 
 namespace PR23801 {
 template <typename>
@@ -552,7 +552,7 @@ struct __declspec(dllexport) B {
 
 }
 //
-// M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FB@PR23801@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc void @"??_FB@PR23801@@QAEXXZ"({{.*}}) comdat
 
 struct __declspec(dllexport) T {
   // Copy assignment operator:

