r263048 - ARM & AArch64: convert asm tests to LLVM IR and restrict optimizations.

Tim Northover via cfe-commits cfe-commits at lists.llvm.org
Wed Mar 9 10:54:43 PST 2016


Modified: cfe/trunk/test/CodeGen/aarch64-neon-ldst-one.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-ldst-one.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-ldst-one.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-ldst-one.c Wed Mar  9 12:54:42 2016
@@ -1,2049 +1,7977 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
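A note on the new RUN pipeline: it checks unoptimized IR rather than -O3 assembly, and `opt -S -mem2reg` is there only to promote clang's -O0 scalar temporaries to SSA values so the CHECK lines stay readable. As a minimal illustration (a generic example, not taken from this test), mem2reg reduces the alloca/store/load scaffolding that -O0 emits for

    int add(int x, int y) { return x + y; }

to a single `add` instruction on the function arguments.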
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u8
   return vld1q_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
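The splat idiom checked above (scalar load, insertelement into lane 0, shufflevector with an all-zero mask) is the IR form of a broadcast. A minimal C model of what the IR computes, assuming arm_neon.h semantics (the _model name is illustrative, not part of the test):

    static inline uint8x16_t vld1q_dup_u8_model(const uint8_t *p) {
      return vdupq_n_u8(*p);  /* broadcast *p to all 16 lanes */
    }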
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u16
   return vld1q_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u32
   return vld1q_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u64
   return vld1q_dup_u64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s8
   return vld1q_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s16
   return vld1q_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s32
   return vld1q_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s64
   return vld1q_dup_s64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f16
   return vld1q_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
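The f16 variant above goes through integer lanes: the half is loaded as i16, splatted as <8 x i16>, and only then bitcast to <8 x half>. A hedged C model of the same reinterpret step, assuming the vreinterpretq_f16_u16 form is available on the target (the _model name is illustrative):

    static inline float16x8_t vld1q_dup_f16_model(const uint16_t *p) {
      /* load/splat as 16-bit integer lanes, then reinterpret as f16 */
      return vreinterpretq_f16_u16(vdupq_n_u16(*p));
    }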
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f32
   return vld1q_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x double> [[LANE]]
 float64x2_t test_vld1q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f64
   return vld1q_dup_f64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p8
   return vld1q_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p16
   return vld1q_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 poly64x2_t test_vld1q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p64
   return vld1q_dup_p64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u8
   return vld1_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u16
   return vld1_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u32
   return vld1_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u64
   return vld1_dup_u64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
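For the one-lane 64-bit vectors the splat is degenerate (the <1 x i32> zeroinitializer shuffle on a single-element vector is an identity), which is why the old asm check accepted either ld1r or a plain ldr. A sketch in C (illustrative name):

    static inline uint64x1_t vld1_dup_u64_model(const uint64_t *p) {
      return vdup_n_u64(*p);  /* one lane: the "splat" is just the load */
    }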
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s8
   return vld1_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s16
   return vld1_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s32
   return vld1_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s64
   return vld1_dup_s64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f16
   return vld1_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f32
   return vld1_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[LANE]]
 float64x1_t test_vld1_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f64
   return vld1_dup_f64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p8
   return vld1_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p16
   return vld1_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p64
   return vld1_dup_p64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u8
   return vld2q_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
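The vld2q_dup checks pivot from matching ld2r assembly to matching the @llvm.aarch64.neon.ld2r.* intrinsic plus clang's -O0 aggregate-return scaffolding (the RETVAL/__RET allocas and the memcpy). Semantically, ld2r loads one two-element structure and replicates each element across its own register; a minimal C model of that behavior (illustrative name):

    static inline uint8x16x2_t vld2q_dup_u8_model(const uint8_t *p) {
      uint8x16x2_t r;
      r.val[0] = vdupq_n_u8(p[0]);  /* first element of the pair, splatted */
      r.val[1] = vdupq_n_u8(p[1]);  /* second element, splatted */
      return r;
    }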
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u16
   return vld2q_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u32
   return vld2q_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u64
   return vld2q_dup_u64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
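The memcpy in these checks is not part of the load itself; it is how unoptimized IR hands back a by-value struct of vectors (mem2reg promotes scalar allocas but leaves aggregate copies alone). A hedged C reduction that yields a similar shape at -O0 (pair_t is an illustrative type, not from the test):

    typedef struct { uint8x16_t a, b; } pair_t;
    pair_t return_pair(uint8x16_t v) {
      pair_t p = { v, v };
      return p;  /* local temp, then copied into the return slot */
    }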
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s8
   return vld2q_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s16
   return vld2q_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s32
   return vld2q_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s64
   return vld2q_dup_s64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f16
   return vld2q_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f32
   return vld2q_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f64
   return vld2q_dup_f64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p8
   return vld2q_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p16
   return vld2q_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p64
   return vld2q_dup_p64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u8
   return vld2_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u16
   return vld2_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u32
   return vld2_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u64
   return vld2_dup_u64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s8
   return vld2_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s16
   return vld2_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s32
   return vld2_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s64
   return vld2_dup_s64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f16
   return vld2_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f32
   return vld2_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f64
   return vld2_dup_f64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p8
   return vld2_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p16
   return vld2_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p64
   return vld2_dup_p64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u8
   return vld3q_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
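The vld3q_dup tests follow the same pattern with a three-element structure: ld3r reads three consecutive elements and splats each into its own register. A C model under the same assumptions as above (illustrative name):

    static inline uint8x16x3_t vld3q_dup_u8_model(const uint8_t *p) {
      uint8x16x3_t r;
      r.val[0] = vdupq_n_u8(p[0]);
      r.val[1] = vdupq_n_u8(p[1]);
      r.val[2] = vdupq_n_u8(p[2]);
      return r;
    }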
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u16
   return vld3q_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u32
   return vld3q_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u64
   return vld3q_dup_u64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s8
   return vld3q_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s16
   return vld3q_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s32
   return vld3q_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s64
   return vld3q_dup_s64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f16
   return vld3q_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f32
   return vld3q_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f64
   return vld3q_dup_f64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld3q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p8
   return vld3q_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p16
   return vld3q_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p64
   return vld3q_dup_p64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
-  // [{{x[0-9]+|sp}}]
 }
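+// The 64-bit vld3_dup tests below mirror the vld3q_dup ones, but use
+// the v8i8/v4i16/v2i32/v1i64 forms of @llvm.aarch64.neon.ld3r and copy
+// 24 bytes (three 8-byte vectors) at 8-byte alignment.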
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld3_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u8
   return vld3_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u16
   return vld3_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u32
   return vld3_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u64
   return vld3_dup_u64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld3_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s8
   return vld3_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s16
   return vld3_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s32
   return vld3_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s64
   return vld3_dup_s64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f16
   return vld3_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f32
   return vld3_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f64
   return vld3_dup_f64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld3_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p8
   return vld3_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p16
   return vld3_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
-  // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p64
   return vld3_dup_p64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
-  // [{{x[0-9]+|sp}}]
 }
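+// The vld4q_dup tests below repeat the pattern with four result
+// vectors: one @llvm.aarch64.neon.ld4r call, followed by a 64-byte,
+// 16-byte-aligned memcpy into the returned struct.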
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld4q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u8
   return vld4q_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u16
   return vld4q_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u32
   return vld4q_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u64
   return vld4q_dup_u64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld4q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s8
   return vld4q_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s16
   return vld4q_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s32
   return vld4q_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s64
   return vld4q_dup_s64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f16
   return vld4q_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f32
   return vld4q_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f64
   return vld4q_dup_f64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld4q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p8
   return vld4q_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p16
   return vld4q_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p64
   return vld4q_dup_p64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
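+// The 64-bit vld4_dup tests below use the v8i8/v4i16/v2i32/v1i64 forms
+// of @llvm.aarch64.neon.ld4r and copy 32 bytes (four 8-byte vectors) at
+// 8-byte alignment.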
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld4_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u8
   return vld4_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u16
   return vld4_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u32
   return vld4_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u64
   return vld4_dup_u64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld4_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s8
   return vld4_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s16
   return vld4_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s32
   return vld4_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s64
   return vld4_dup_s64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f16
   return vld4_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f32
   return vld4_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f64
   return vld4_dup_f64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld4_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p8
   return vld4_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p16
   return vld4_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p64
   return vld4_dup_p64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
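 
+// vld4_dup_<type> (LD4R) loads four consecutive elements and broadcasts
+// element i across every lane of result vector i. A minimal usage sketch,
+// assuming an AArch64 NEON target (buf is illustrative, not from the test):
+//   uint32_t buf[4] = {1, 2, 3, 4};
+//   uint32x2x4_t r = vld4_dup_u32(buf);  // r.val[i] == {buf[i], buf[i]}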
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u8
   return vld1q_lane_u8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u16
   return vld1q_lane_u16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u32
   return vld1q_lane_u32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u64
   return vld1q_lane_u64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s8
   return vld1q_lane_s8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s16
   return vld1q_lane_s16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s32
   return vld1q_lane_s32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s64
   return vld1q_lane_s64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f16
   return vld1q_lane_f16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f32
   return vld1q_lane_f32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
+// CHECK:   ret <2 x double> [[VLD1_LANE]]
 float64x2_t test_vld1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f64
   return vld1q_lane_f64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p8
   return vld1q_lane_p8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p16
   return vld1q_lane_p16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 poly64x2_t test_vld1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p64
   return vld1q_lane_p64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
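 
+// vld1q_lane_<type> loads a single element into the requested lane of an
+// existing vector and leaves every other lane untouched. A minimal sketch,
+// assuming an AArch64 NEON target (values are illustrative):
+//   uint16_t x = 42;
+//   uint16x8_t v = vdupq_n_u16(0);
+//   v = vld1q_lane_u16(&x, v, 7);  // lane 7 becomes 42, lanes 0..6 stay 0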
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_u8
   return vld1_lane_u8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_u16
   return vld1_lane_u16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_u32
   return vld1_lane_u32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_u64
   return vld1_lane_u64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_s8
   return vld1_lane_s8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_s16
   return vld1_lane_s16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_s32
   return vld1_lane_s32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_s64
   return vld1_lane_s64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_f16
   return vld1_lane_f16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_f32
   return vld1_lane_f32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
+// CHECK:   ret <1 x double> [[VLD1_LANE]]
 float64x1_t test_vld1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_f64
   return vld1_lane_f64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_p8
   return vld1_lane_p8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_p16
   return vld1_lane_p16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_p64
   return vld1_lane_p64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
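 
+// The 64-bit d-register forms hold a single lane, so 0 is the only valid
+// index and the lane load replaces the whole vector. Sketch, assuming an
+// AArch64 NEON target (values are illustrative):
+//   int64_t x = 7;
+//   int64x1_t v = vld1_lane_s64(&x, vdup_n_s64(0), 0);  // v == {7}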
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_s8
   return vld2q_lane_s8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b,  v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_u8
   return vld2q_lane_u8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] %src.coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_p8
   return vld2q_lane_p8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
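 
+// vld2q_lane loads two consecutive elements and writes one into lane N of
+// each vector of the pair, returning the updated pair. Sketch, assuming an
+// AArch64 NEON target (buf and s are illustrative):
+//   uint8_t buf[2] = {1, 2};
+//   uint8x16x2_t s = {{ vdupq_n_u8(0), vdupq_n_u8(0) }};
+//   s = vld2q_lane_u8(buf, s, 15);  // s.val[0][15] == 1, s.val[1][15] == 2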
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP9]]
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_s8
   return vld3q_lane_s8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %src.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_u8
   return vld3q_lane_u8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
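 
+// vld3q_lane extends the same pattern to three vectors: consecutive bytes
+// buf3[0..2] land in lane 15 of val[0], val[1] and val[2]. Sketch, assuming
+// an AArch64 NEON target (illustrative values):
+//   uint8_t buf3[3] = {1, 2, 3};
+//   uint8x16x3_t t = {{ vdupq_n_u8(0), vdupq_n_u8(0), vdupq_n_u8(0) }};
+//   t = vld3q_lane_u8(buf3, t, 15);  // t.val[k][15] == buf3[k]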
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
 uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u16
   return vld2q_lane_u16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
 uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u32
   return vld2q_lane_u32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
 uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u64
   return vld2q_lane_u64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
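 
+// For elements wider than a byte the checks above also pin down the
+// <16 x i8> bitcasts Clang emits around each ld2lane operand; the legal lane
+// index shrinks with the element count (7 for i16, 3 for i32, 1 for i64).
+// Sketch of the u64 form, assuming an AArch64 NEON target (illustrative):
+//   uint64_t buf64[2] = {1, 2};
+//   uint64x2x2_t p = {{ vdupq_n_u64(0), vdupq_n_u64(0) }};
+//   p = vld2q_lane_u64(buf64, p, 1);  // p.val[0][1] == 1, p.val[1][1] == 2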
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] %b.coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP13]]
 int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s16
   return vld2q_lane_s16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] %b.coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP13]]
 int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s32
   return vld2q_lane_s32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP13]]
 int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s64
   return vld2q_lane_s64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP13]]
 float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f16
   return vld2q_lane_f16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP13]]
 float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f32
   return vld2q_lane_f32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP13]]
 float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f64
   return vld2q_lane_f64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
 poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p16
   return vld2q_lane_p16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
 poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p64
   return vld2q_lane_p64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u8
   return vld2_lane_u8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
 uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u16
   return vld2_lane_u16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
 uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u32
   return vld2_lane_u32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u64
   return vld2_lane_u64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
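
(Note: in the 64-bit element variants above, each result vector holds a single
element, so 0 is the only valid lane index, which is why these tests pass 0.
A brief sketch with illustrative names:)

    #include <arm_neon.h>

    uint64x1x2_t load_pair(uint64_t *p, uint64x1x2_t acc) {
      /* <1 x i64> vectors have exactly one lane, hence index 0. */
      return vld2_lane_u64(p, acc, 0);
    }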
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s8
   return vld2_lane_s8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP13]]
 int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s16
   return vld2_lane_s16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP13]]
 int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s32
   return vld2_lane_s32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP13]]
 int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s64
   return vld2_lane_s64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP13]]
 float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f16
   return vld2_lane_f16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f32
   return vld2_lane_f32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP13]]
 float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f64
   return vld2_lane_f64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p8
   return vld2_lane_p8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
 poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p16
   return vld2_lane_p16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
 poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p64
   return vld2_lane_p64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u16
   return vld3q_lane_u16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
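
The three-register variants return a struct of three vectors; a hedged usage
sketch (names invented, not part of the test) of the intrinsic checked above:

  #include <arm_neon.h>

  // De-interleave three u16 values from buf into lane 7 of the three
  // vectors carried in v.val[0..2]; all other lanes are preserved.
  uint16x8x3_t update_last_lane(const uint16_t *buf, uint16x8x3_t v) {
    return vld3q_lane_u16(buf, v, 7);  // lane must be a constant in [0, 7]
  }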
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
 uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u32
   return vld3q_lane_u32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
 uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u64
   return vld3q_lane_u64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP16]]
 int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s16
   return vld3q_lane_s16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP16]]
 int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s32
   return vld3q_lane_s32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP16]]
 int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s64
   return vld3q_lane_s64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP16]]
 float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f16
   return vld3q_lane_f16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
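
Worth noting: there is no half-precision ld3lane intrinsic at the IR level
here. As the checks above show, the <8 x half> operands are bitcast to
<8 x i16> and @llvm.aarch64.neon.ld3lane.v8i16.p0i8 is reused, so the f16 test
exercises the same lowering path as the i16 ones.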
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP16]]
 float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f32
   return vld3q_lane_f32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP16]]
 float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f64
   return vld3q_lane_f64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
 poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p8
   return vld3q_lane_p8(a, b, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
 poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p16
   return vld3q_lane_p16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
 poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p64
   return vld3q_lane_p64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
 uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u8
   return vld3_lane_u8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
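
These tests always pass a literal lane index because the intrinsics require an
immediate; a sketch of the constraint (function name invented, not part of the
test):

  #include <arm_neon.h>

  uint8x8x3_t load3_into_lane7(const uint8_t *p, uint8x8x3_t v) {
    return vld3_lane_u8(p, v, 7);  // OK: constant lane; 7 is the max for 8 x 8-bit lanes
    // vld3_lane_u8(p, v, n) with a runtime n would be rejected at compile time.
  }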
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
 uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u16
   return vld3_lane_u16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
 uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u32
   return vld3_lane_u32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
 uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u64
   return vld3_lane_u64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP9]]
 int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s8
   return vld3_lane_s8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP16]]
 int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s16
   return vld3_lane_s16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
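
For the wider element types, the checks show each source vector taking a
round trip through <8 x i8> before the @llvm.aarch64.neon.ld3lane call;
the bitcasts change no bits and simply funnel every element type onto one
lowering path. The source-level semantics stay the same lane replacement
as the s8 case, which a small hypothetical harness (not part of the test
file) can demonstrate:

  #include <arm_neon.h>
  #include <stdio.h>

  int main(void) {
    int16_t buf[3] = {100, 200, 300};      /* one interleaved triple */
    int16x4x3_t v;
    v.val[0] = vdup_n_s16(-1);
    v.val[1] = vdup_n_s16(-1);
    v.val[2] = vdup_n_s16(-1);
    v = vld3_lane_s16(buf, v, 3);          /* only lane 3 is replaced */
    /* prints: 100 200 300 */
    printf("%d %d %d\n", vget_lane_s16(v.val[0], 3),
           vget_lane_s16(v.val[1], 3), vget_lane_s16(v.val[2], 3));
    return 0;
  }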
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP16]]
 int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s32
   return vld3_lane_s32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP16]]
 int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s64
   return vld3_lane_s64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP16]]
 float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f16
   return vld3_lane_f16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
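
Note that the float16 test above checks for the same
@llvm.aarch64.neon.ld3lane.v4i16 intrinsic as the s16 test: at the IR
level half vectors are carried as <4 x i16>, so only the wrapper struct
type differs. Source-level usage is unchanged (sketch with illustrative
names):

  /* Same lane-replacement shape as the integer cases; lane in [0, 3]. */
  float16x4x3_t refresh_f16_lane3(const float16_t *p, float16x4x3_t v) {
    return vld3_lane_f16(p, v, 3);
  }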
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP16]]
 float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f32
   return vld3_lane_f32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP16]]
 float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f64
   return vld3_lane_f64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
 poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p8
   return vld3_lane_p8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
 poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p16
   return vld3_lane_p16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
 poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p64
   return vld3_lane_p64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
 uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u8
   return vld4q_lane_u8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
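
The vld4q variants that follow are the 128-bit, four-vector counterparts:
the aggregate is 64 bytes with 16-byte alignment (hence the i64 64, i32 16
memcpy arguments in the checks), and the valid lane range scales with the
element count, e.g. [0, 15] for <16 x i8>. A usage sketch with
illustrative names:

  /* p[0..3] land in lane 15 of b.val[0..3] respectively; all other
     lanes of the four q registers are preserved. */
  uint8x16x4_t refresh_lane15(const uint8_t *p, uint8x16x4_t b) {
    return vld4q_lane_u8(p, b, 15);
  }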
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
 uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u16
   return vld4q_lane_u16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
 uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u32
   return vld4q_lane_u32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
 uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u64
   return vld4q_lane_u64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP10]]
 int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s8
   return vld4q_lane_s8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP19]]
 int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s16
   return vld4q_lane_s16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP19]]
 int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s32
   return vld4q_lane_s32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP19]]
 int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s64
   return vld4q_lane_s64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] %b.coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP19]]
 float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f16
   return vld4q_lane_f16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP19]]
 float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f32
   return vld4q_lane_f32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP19]]
 float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f64
   return vld4q_lane_f64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
 poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p8
   return vld4q_lane_p8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
 poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p16
   return vld4q_lane_p16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
 poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p64
   return vld4q_lane_p64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
 uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u8
   return vld4_lane_u8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
 uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u16
   return vld4_lane_u16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
 uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u32
   return vld4_lane_u32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
 uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u64
   return vld4_lane_u64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP10]]
 int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s8
   return vld4_lane_s8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP19]]
 int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s16
   return vld4_lane_s16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP19]]
 int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s32
   return vld4_lane_s32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP19]]
 int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s64
   return vld4_lane_s64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP19]]
 float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f16
   return vld4_lane_f16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP19]]
 float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f32
   return vld4_lane_f32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP19]]
 float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f64
   return vld4_lane_f64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
 poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p8
   return vld4_lane_p8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
 poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p16
   return vld4_lane_p16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
 poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p64
   return vld4_lane_p64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u8
   vst1q_lane_u8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u16
   vst1q_lane_u16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u32
   vst1q_lane_u32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u64
   vst1q_lane_u64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s8
   vst1q_lane_s8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s16
   vst1q_lane_s16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s32
   vst1q_lane_s32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s64
   vst1q_lane_s64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f16
   vst1q_lane_f16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f32
   vst1q_lane_f32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f64
   vst1q_lane_f64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p8
   vst1q_lane_p8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p16
   vst1q_lane_p16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p64
   vst1q_lane_p64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_u8
   vst1_lane_u8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_u16
   vst1_lane_u16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_u32
   vst1_lane_u32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_u64
   vst1_lane_u64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_s8
   vst1_lane_s8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_s16
   vst1_lane_s16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_s32
   vst1_lane_s32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_s64
   vst1_lane_s64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_f16
   vst1_lane_f16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_f32
   vst1_lane_f32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_f64
   vst1_lane_f64(a, b, 0);
-  // CHECK: {{st1 { v[0-9]+.d }\[0]|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_p8
   vst1_lane_p8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_p16
   vst1_lane_p16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_p64
   vst1_lane_p64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u8
   vst2q_lane_u8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u16
   vst2q_lane_u16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u32
   vst2q_lane_u32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u64
   vst2q_lane_u64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s8
   vst2q_lane_s8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s16
   vst2q_lane_s16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s32
   vst2q_lane_s32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s64
   vst2q_lane_s64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f16
   vst2q_lane_f16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f32
   vst2q_lane_f32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f64
   vst2q_lane_f64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p8
   vst2q_lane_p8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p16
   vst2q_lane_p16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] %b.coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p64
   vst2q_lane_p64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
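+
+// The vst2q_lane tests above all share one IR shape: the two-vector
+// aggregate is spilled to a stack temporary, memcpy'd into __s1, each
+// sub-vector is reloaded, and a single @llvm.aarch64.neon.st2lane.* call
+// stores the chosen lane. A usage sketch (illustrative only; `buf` is a
+// hypothetical pointer to at least four valid poly64_t elements):
+//   poly64x2x2_t v = vld2q_p64(buf);  // load two de-interleaved vectors
+//   vst2q_lane_p64(buf, v, 1);        // store lane 1 of each vector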
 
+// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u8
   vst2_lane_u8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u16
   vst2_lane_u16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u32
   vst2_lane_u32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] %b.coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u64
   vst2_lane_u64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s8
   vst2_lane_s8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s16
   vst2_lane_s16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] %b.coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s32
   vst2_lane_s32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] %b.coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s64
   vst2_lane_s64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] %b.coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f16
   vst2_lane_f16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
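+
+// Note the f16 pattern above: the <4 x half> payloads are bitcast through
+// <8 x i8> to <4 x i16> before the call, so the float16 variant shares
+// @llvm.aarch64.neon.st2lane.v4i16 with the u16/s16/p16 tests instead of
+// using a half-precision overload.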
 
+// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] %b.coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f32
   vst2_lane_f32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] %b.coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f64
   vst2_lane_f64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] %b.coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p8
   vst2_lane_p8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] %b.coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p16
   vst2_lane_p16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] %b.coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p64
   vst2_lane_p64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u8
   vst3q_lane_u8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
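+
+// The vst3q_lane tests extend the same pattern to three sub-vectors and a
+// 48-byte memcpy. A minimal sketch, assuming `buf` points to at least 48
+// valid bytes (names are illustrative):
+//   uint8x16x3_t v = vld3q_u8(buf);  // three de-interleaved 16-byte vectors
+//   vst3q_lane_u8(buf, v, 15);       // store lane 15 of each vector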
 
+// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u16
   vst3q_lane_u16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u32
   vst3q_lane_u32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u64
   vst3q_lane_u64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s8
   vst3q_lane_s8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s16
   vst3q_lane_s16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] %b.coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s32
   vst3q_lane_s32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s64
   vst3q_lane_s64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] %b.coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f16
   vst3q_lane_f16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] %b.coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f32
   vst3q_lane_f32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] %b.coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f64
   vst3q_lane_f64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] %b.coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p8
   vst3q_lane_p8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] %b.coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p16
   vst3q_lane_p16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] %b.coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p64
   vst3q_lane_p64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
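+// Note: the alloca/memcpy/getelementptr scaffolding checked in these tests
+// is how the by-value NEON struct argument ([3 x <2 x i64>] etc.) is spilled
+// and re-read; since only mem2reg runs on the output, these aggregate copies
+// are expected to survive into the checked IR.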
 
+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u8
   vst3_lane_u8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u16
   vst3_lane_u16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u32
   vst3_lane_u32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u64
   vst3_lane_u64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s8
   vst3_lane_s8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s16
   vst3_lane_s16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] %b.coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s32
   vst3_lane_s32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s64
   vst3_lane_s64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] %b.coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f16
   vst3_lane_f16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] %b.coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f32
   vst3_lane_f32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] %b.coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f64
   vst3_lane_f64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] %b.coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p8
   vst3_lane_p8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] %b.coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p16
   vst3_lane_p16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] %b.coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p64
   vst3_lane_p64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
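+// The vst4q/vst4 lane tests below follow the same shape with a fourth
+// element (VAL5/ARRAYIDX6) and the st4lane intrinsic; as a rough scalar
+// model (same vector-subscript assumption as the sketch above),
+//   vst4q_lane_u8(a, b, 15) stores a[i] = b.val[i][15] for i = 0..3.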
 
+// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u8
   vst4q_lane_u8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u16
   vst4q_lane_u16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
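+// The <8 x i16> -> <16 x i8> -> <8 x i16> bitcast round trip above comes
+// from clang's generic NEON builtin lowering, which funnels operands through
+// byte vectors before emitting the typed st4lane call; it is expected IR,
+// not slack in the checks.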
 
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u32
   vst4q_lane_u32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u64
   vst4q_lane_u64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s8
   vst4q_lane_s8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s16
   vst4q_lane_s16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] %b.coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s32
   vst4q_lane_s32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s64
   vst4q_lane_s64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] %b.coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f16
   vst4q_lane_f16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] %b.coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f32
   vst4q_lane_f32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] %b.coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f64
   vst4q_lane_f64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] %b.coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p8
   vst4q_lane_p8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] %b.coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p16
   vst4q_lane_p16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] %b.coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p64
   vst4q_lane_p64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u8
   vst4_lane_u8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u16
   vst4_lane_u16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u32
   vst4_lane_u32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] %b.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u64
   vst4_lane_u64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
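
For the 64-bit d-register forms the vectors are <1 x i64>, so 0 is the only lane the intrinsic accepts; the lane argument must be an integer constant expression and out-of-range values are rejected at compile time. A sketch (the exact diagnostic wording may differ):

  vst4_lane_u64(a, b, 0);    /* OK: the single available lane */
  /* vst4_lane_u64(a, b, 1);    error: lane index out of range */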
 
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s8
   vst4_lane_s8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s16
   vst4_lane_s16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] %b.coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s32
   vst4_lane_s32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] %b.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s64
   vst4_lane_s64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] %b.coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f16
   vst4_lane_f16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] %b.coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f32
   vst4_lane_f32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] %b.coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f64
   vst4_lane_f64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] %b.coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p8
   vst4_lane_p8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] %b.coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p16
   vst4_lane_p16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] %b.coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p64
   vst4_lane_p64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
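
The vst4_lane tests above all share one IR shape: the struct argument arrives coerced to an array, is spilled to an alloca, memcpy'd into a shadow copy, and each of the four sub-vectors is loaded back before the single @llvm.aarch64.neon.st4lane call. At the source level this is simply an interleaved lane store; a minimal usage sketch (function name, buffer, and values are illustrative only, assuming an AArch64 target with NEON):

    #include <arm_neon.h>

    /* Sketch only: vst4_lane_p16 takes lane 3 from each of the four
     * <4 x i16> vectors in `quad` and stores those four poly16_t
     * values contiguously, in interleaved order, starting at `buf`. */
    void demo_vst4_lane(poly16_t *buf, poly16x4x4_t quad) {
      vst4_lane_p16(buf, quad, 3);
    }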

Modified: cfe/trunk/test/CodeGen/aarch64-neon-misc.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-misc.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-misc.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-misc.c Wed Mar  9 12:54:42 2016
@@ -1,2041 +1,3068 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
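
A note on the rewritten RUN pipeline: clang now emits unoptimized LLVM IR (-emit-llvm, no -O3), and piping through `opt -S -mem2reg` promotes the -O0 parameter allocas to SSA values, so the CHECK lines below match stable IR instead of fragile -O3 assembly. A sketch of the pattern every test in this file uses (IR names in the comment are approximate):

    #include <arm_neon.h>

    /* After -emit-llvm | opt -mem2reg the body reduces to roughly:
     *   %0 = icmp eq <8 x i8> %a, zeroinitializer
     *   %vceqz.i = sext <8 x i1> %0 to <8 x i8>
     * i.e. each lane becomes 0xFF where it was zero, 0x00 otherwise,
     * which is exactly what the first CHECK block below matches. */
    uint8x8_t all_ones_where_zero(int8x8_t a) {
      return vceqz_s8(a);
    }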
 
-// CHECK-LABEL: test_vceqz_s8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_s8(int8x8_t a) {
   return vceqz_s8(a);
 }
 
-// CHECK-LABEL: test_vceqz_s16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_s16(int16x4_t a) {
   return vceqz_s16(a);
 }
 
-// CHECK-LABEL: test_vceqz_s32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vceqz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_s32(int32x2_t a) {
   return vceqz_s32(a);
 }
 
-// CHECK-LABEL: test_vceqz_s64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_s64(int64x1_t a) {
   return vceqz_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_u64(uint64x1_t a) {
   return vceqz_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_p64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_p64(poly64x1_t a) {
   return vceqz_p64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_s8(int8x16_t a) {
   return vceqzq_s8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_s16(int16x8_t a) {
   return vceqzq_s16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_s32(int32x4_t a) {
   return vceqzq_s32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_s64(int64x2_t a) {
   return vceqzq_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_u8(uint8x8_t a) {
   return vceqz_u8(a);
 }
 
-// CHECK-LABEL: test_vceqz_u16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_u16(uint16x4_t a) {
   return vceqz_u16(a);
 }
 
-// CHECK-LABEL: test_vceqz_u32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vceqz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_u32(uint32x2_t a) {
   return vceqz_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_u8(uint8x16_t a) {
   return vceqzq_u8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_u16(uint16x8_t a) {
   return vceqzq_u16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_u32(uint32x4_t a) {
   return vceqzq_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_u64(uint64x2_t a) {
   return vceqzq_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_f32
-// CHECK: fcmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vceqz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_f32(float32x2_t a) {
   return vceqz_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_f64
-// CHECK: fcmeq  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_f64(float64x1_t a) {
   return vceqz_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f32
-// CHECK: fcmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_f32(float32x4_t a) {
   return vceqzq_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_p8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_p8(poly8x8_t a) {
   return vceqz_p8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_p8(poly8x16_t a) {
   return vceqzq_p8(a);
 }
 
-// CHECK-LABEL: test_vceqz_p16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_p16(poly16x4_t a) {
   return vceqz_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_p16(poly16x8_t a) {
   return vceqzq_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f64
-// CHECK: fcmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_f64(float64x2_t a) {
   return vceqzq_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_p64(poly64x2_t a) {
   return vceqzq_p64(a);
 }
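
Every compare-to-zero intrinsic in this family now checks for the same two-instruction IR shape: an element-wise icmp (fcmp with ordered predicates for the float variants) against zeroinitializer, sign-extended so a true lane reads back as all ones and a false lane as all zeros. The vcgez/vclez/vcgtz/vcltz tests that follow only swap the predicate (sge/sle/sgt/slt, or oge/ole/ogt/olt for floats). A per-lane scalar model (hypothetical helper, for illustration only):

    #include <stdint.h>

    /* What one lane of vceqz_s8 computes: negating the 0-or-1
     * comparison result sign-extends it to 0x00 or 0xFF. */
    static inline uint8_t vceqz_s8_lane(int8_t lane) {
      return (uint8_t)-(lane == 0);
    }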
 
-// CHECK-LABEL: test_vcgez_s8
-// CHECK: cmge  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vcgez_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGEZ_I]]
 uint8x8_t test_vcgez_s8(int8x8_t a) {
   return vcgez_s8(a);
 }
 
-// CHECK-LABEL: test_vcgez_s16
-// CHECK: cmge  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vcgez_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGEZ_I]]
 uint16x4_t test_vcgez_s16(int16x4_t a) {
   return vcgez_s16(a);
 }
 
-// CHECK-LABEL: test_vcgez_s32
-// CHECK: cmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vcgez_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_s32(int32x2_t a) {
   return vcgez_s32(a);
 }
 
-// CHECK-LABEL: test_vcgez_s64
-// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_s64(int64x1_t a) {
   return vcgez_s64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s8
-// CHECK: cmge  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vcgezq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGEZ_I]]
 uint8x16_t test_vcgezq_s8(int8x16_t a) {
   return vcgezq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s16
-// CHECK: cmge  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vcgezq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGEZ_I]]
 uint16x8_t test_vcgezq_s16(int16x8_t a) {
   return vcgezq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s32
-// CHECK: cmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vcgezq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_s32(int32x4_t a) {
   return vcgezq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s64
-// CHECK: cmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vcgezq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_s64(int64x2_t a) {
   return vcgezq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgez_f32
-// CHECK: fcmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcgez_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_f32(float32x2_t a) {
   return vcgez_f32(a);
 }
 
-// CHECK-LABEL: test_vcgez_f64
-// CHECK: fcmge  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcgez_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_f64(float64x1_t a) {
   return vcgez_f64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f32
-// CHECK: fcmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcgezq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_f32(float32x4_t a) {
   return vcgezq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f64
-// CHECK: fcmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcgezq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_f64(float64x2_t a) {
   return vcgezq_f64(a);
 }
 
-// CHECK-LABEL: test_vclez_s8
-// CHECK: cmle  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vclez_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLEZ_I]]
 uint8x8_t test_vclez_s8(int8x8_t a) {
   return vclez_s8(a);
 }
 
-// CHECK-LABEL: test_vclez_s16
-// CHECK: cmle  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vclez_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLEZ_I]]
 uint16x4_t test_vclez_s16(int16x4_t a) {
   return vclez_s16(a);
 }
 
-// CHECK-LABEL: test_vclez_s32
-// CHECK: cmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vclez_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_s32(int32x2_t a) {
   return vclez_s32(a);
 }
 
-// CHECK-LABEL: test_vclez_s64
-// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_s64(int64x1_t a) {
   return vclez_s64(a);
 }
 
-// CHECK-LABEL: test_vclezq_s8
-// CHECK: cmle  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vclezq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLEZ_I]]
 uint8x16_t test_vclezq_s8(int8x16_t a) {
   return vclezq_s8(a);
 }
 
-// CHECK-LABEL: test_vclezq_s16
-// CHECK: cmle  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vclezq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLEZ_I]]
 uint16x8_t test_vclezq_s16(int16x8_t a) {
   return vclezq_s16(a);
 }
 
-// CHECK-LABEL: test_vclezq_s32
-// CHECK: cmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vclezq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_s32(int32x4_t a) {
   return vclezq_s32(a);
 }
 
-// CHECK-LABEL: test_vclezq_s64
-// CHECK: cmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vclezq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_s64(int64x2_t a) {
   return vclezq_s64(a);
 }
 
-// CHECK-LABEL: test_vclez_f32
-// CHECK: fcmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vclez_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_f32(float32x2_t a) {
   return vclez_f32(a);
 }
 
-// CHECK-LABEL: test_vclez_f64
-// CHECK: fcmle  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vclez_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_f64(float64x1_t a) {
   return vclez_f64(a);
 }
 
-// CHECK-LABEL: test_vclezq_f32
-// CHECK: fcmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vclezq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_f32(float32x4_t a) {
   return vclezq_f32(a);
 }
 
-// CHECK-LABEL: test_vclezq_f64
-// CHECK: fcmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vclezq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_f64(float64x2_t a) {
   return vclezq_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s8
-// CHECK: cmgt  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vcgtz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGTZ_I]]
 uint8x8_t test_vcgtz_s8(int8x8_t a) {
   return vcgtz_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s16
-// CHECK: cmgt  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vcgtz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGTZ_I]]
 uint16x4_t test_vcgtz_s16(int16x4_t a) {
   return vcgtz_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s32
-// CHECK: cmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vcgtz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_s32(int32x2_t a) {
   return vcgtz_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s64
-// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_s64(int64x1_t a) {
   return vcgtz_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s8
-// CHECK: cmgt  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGTZ_I]]
 uint8x16_t test_vcgtzq_s8(int8x16_t a) {
   return vcgtzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s16
-// CHECK: cmgt  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGTZ_I]]
 uint16x8_t test_vcgtzq_s16(int16x8_t a) {
   return vcgtzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s32
-// CHECK: cmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_s32(int32x4_t a) {
   return vcgtzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s64
-// CHECK: cmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vcgtzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_s64(int64x2_t a) {
   return vcgtzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f32
-// CHECK: fcmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcgtz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_f32(float32x2_t a) {
   return vcgtz_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f64
-// CHECK: fcmgt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcgtz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_f64(float64x1_t a) {
   return vcgtz_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f32
-// CHECK: fcmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcgtzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_f32(float32x4_t a) {
   return vcgtzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f64
-// CHECK: fcmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcgtzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_f64(float64x2_t a) {
   return vcgtzq_f64(a);
 }
 
-// CHECK-LABEL: test_vcltz_s8
-// CHECK: sshr  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7
+// CHECK-LABEL: define <8 x i8> @test_vcltz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLTZ_I]]
 uint8x8_t test_vcltz_s8(int8x8_t a) {
   return vcltz_s8(a);
 }
 
-// CHECK-LABEL: test_vcltz_s16
-// CHECK: sshr  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+// CHECK-LABEL: define <4 x i16> @test_vcltz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLTZ_I]]
 uint16x4_t test_vcltz_s16(int16x4_t a) {
   return vcltz_s16(a);
 }
 
-// CHECK-LABEL: test_vcltz_s32
-// CHECK: sshr  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+// CHECK-LABEL: define <2 x i32> @test_vcltz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_s32(int32x2_t a) {
   return vcltz_s32(a);
 }
 
-// CHECK-LABEL: test_vcltz_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+// CHECK-LABEL: define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_s64(int64x1_t a) {
   return vcltz_s64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s8
-// CHECK: sshr  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7
+// CHECK-LABEL: define <16 x i8> @test_vcltzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLTZ_I]]
 uint8x16_t test_vcltzq_s8(int8x16_t a) {
   return vcltzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s16
-// CHECK: sshr  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+// CHECK-LABEL: define <8 x i16> @test_vcltzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLTZ_I]]
 uint16x8_t test_vcltzq_s16(int16x8_t a) {
   return vcltzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s32
-// CHECK: sshr  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+// CHECK-LABEL: define <4 x i32> @test_vcltzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_s32(int32x4_t a) {
   return vcltzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s64
-// CHECK: sshr  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
+// CHECK-LABEL: define <2 x i64> @test_vcltzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_s64(int64x2_t a) {
   return vcltzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcltz_f32
-// CHECK: fcmlt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcltz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_f32(float32x2_t a) {
   return vcltz_f32(a);
 }
  
-// CHECK-LABEL: test_vcltz_f64
-// CHECK: fcmlt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcltz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_f64(float64x1_t a) {
   return vcltz_f64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f32
-// CHECK: fcmlt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcltzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_f32(float32x4_t a) {
   return vcltzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f64
-// CHECK: fcmlt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcltzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_f64(float64x2_t a) {
   return vcltzq_f64(a);
 }
 
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
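
The vrev16/vrev32/vrev64 tests above now match a plain shufflevector whose mask reverses the lanes within each 16-, 32-, or 64-bit chunk, rather than a rev* instruction. Clang's __builtin_shufflevector expresses the same operation directly; a sketch equivalent to vrev32_s16 (mask <1, 0, 3, 2>, as in its CHECK line above; helper name is hypothetical):

    #include <arm_neon.h>

    /* Swap the two 16-bit halves of each 32-bit chunk:
     * lanes (0, 1, 2, 3) -> (1, 0, 3, 2), matching vrev32_s16. */
    static inline int16x4_t rev32_s16_by_shuffle(int16x4_t a) {
      return __builtin_shufflevector(a, a, 1, 0, 3, 2);
    }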
 
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_s8
   return vpaddl_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_s16
   return vpaddl_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_s32
   return vpaddl_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_u8
   return vpaddl_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_u16
   return vpaddl_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_u32
   return vpaddl_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_s8
   return vpaddlq_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_s16
   return vpaddlq_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_s32
   return vpaddlq_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_u8
   return vpaddlq_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_u16
   return vpaddlq_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_u32
   return vpaddlq_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
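+// vpadal_*: pairwise add-long and accumulate. Emitted as saddlp/uaddlp on the
+// second operand followed by a plain vector add into the accumulator.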
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vpadal_s8
   return vpadal_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vpadal_s16
   return vpadal_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vpadal_s32
   return vpadal_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vpadal_u8
   return vpadal_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vpadal_u16
   return vpadal_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vpadal_u32
   return vpadal_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_s8
   return vpadalq_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_s16
   return vpadalq_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_s32
   return vpadalq_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_u8
   return vpadalq_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_u16
   return vpadalq_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_u32
   return vpadalq_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
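+// vqabs_*: signed saturating absolute value (sqabs); INT_MIN saturates to
+// INT_MAX rather than wrapping.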
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqabs_s8
   return vqabs_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqabsq_s8
   return vqabsq_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqabs_s16
   return vqabs_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqabsq_s16
   return vqabsq_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqabs_s32
   return vqabs_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqabsq_s32
   return vqabsq_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vqabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqabsq_s64
   return vqabsq_s64(a);
-  // CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
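+// vqneg_*: signed saturating negation (sqneg); INT_MIN likewise saturates to INT_MAX.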
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqneg_s8
   return vqneg_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqnegq_s8
   return vqnegq_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqneg_s16
   return vqneg_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqnegq_s16
   return vqnegq_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqneg_s32
   return vqneg_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqnegq_s32
   return vqnegq_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vqnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqnegq_s64
   return vqnegq_s64(a);
-  // CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
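+// vneg_*: plain negation, emitted as a subtraction from zero (fsub from -0.0
+// for the floating-point variants so signed zeros are handled correctly).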
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vneg_s8
   return vneg_s8(a);
-  // CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vnegq_s8
   return vnegq_s8(a);
-  // CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vneg_s16
   return vneg_s16(a);
-  // CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vnegq_s16
   return vnegq_s16(a);
-  // CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vneg_s32
   return vneg_s32(a);
-  // CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vnegq_s32
   return vnegq_s32(a);
-  // CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vnegq_s64
   return vnegq_s64(a);
-  // CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vneg_f32
   return vneg_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vnegq_f32
   return vnegq_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vnegq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vnegq_f64
   return vnegq_f64(a);
-  // CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
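+// vabs_*: absolute value; integer lanes use the aarch64.neon.abs intrinsic,
+// floating-point lanes the target-independent llvm.fabs.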
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vabs_s8
   return vabs_s8(a);
-  // CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vabsq_s8
   return vabsq_s8(a);
-  // CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #2
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vabs_s16
   return vabs_s16(a);
-  // CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #2
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vabsq_s16
   return vabsq_s16(a);
-  // CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #2
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vabs_s32
   return vabs_s32(a);
-  // CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #2
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vabsq_s32
   return vabsq_s32(a);
-  // CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]]) #2
+// CHECK:   ret <2 x i64> [[VABS1_I]]
 int64x2_t test_vabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vabsq_s64
   return vabsq_s64(a);
-  // CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #2
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vabs_f32
   return vabs_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #2
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vabsq_f32
   return vabsq_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vabsq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]]) #2
+// CHECK:   ret <2 x double> [[VABS1_I]]
 float64x2_t test_vabsq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vabsq_f64
   return vabsq_f64(a);
-  // CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
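+// vuqadd_*: signed saturating add of an unsigned operand to a signed accumulator (suqadd).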
+// CHECK-LABEL: define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VUQADD_I]]
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuqadd_s8
   return vuqadd_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VUQADD_I]]
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuqaddq_s8
   return vuqaddq_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) #2
+// CHECK:   ret <4 x i16> [[VUQADD2_I]]
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuqadd_s16
   return vuqadd_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) #2
+// CHECK:   ret <8 x i16> [[VUQADD2_I]]
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuqaddq_s16
   return vuqaddq_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) #2
+// CHECK:   ret <2 x i32> [[VUQADD2_I]]
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuqadd_s32
   return vuqadd_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) #2
+// CHECK:   ret <4 x i32> [[VUQADD2_I]]
 int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuqaddq_s32
   return vuqaddq_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) #2
+// CHECK:   ret <2 x i64> [[VUQADD2_I]]
 int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuqaddq_s64
   return vuqaddq_s64(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
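+// vcls_*: count leading sign bits, i.e. how many consecutive bits below the sign bit match it.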
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcls_s8
   return vcls_s8(a);
-  // CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclsq_s8
   return vclsq_s8(a);
-  // CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vcls_s16
   return vcls_s16(a);
-  // CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclsq_s16
   return vclsq_s16(a);
-  // CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcls_s32
   return vcls_s32(a);
-  // CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclsq_s32
   return vclsq_s32(a);
-  // CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
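+// vclz_*: count leading zeros via llvm.ctlz with is_zero_undef == false, so a
+// zero lane yields the full lane width.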
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vclz_s8
   return vclz_s8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclzq_s8
   return vclzq_s8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vclz_s16
   return vclz_s16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclzq_s16
   return vclzq_s16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vclz_s32
   return vclz_s32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclzq_s32
   return vclzq_s32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vclz_u8
   return vclz_u8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vclzq_u8
   return vclzq_u8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vclz_u16
   return vclz_u16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vclzq_u16
   return vclzq_u16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vclz_u32
   return vclz_u32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vclzq_u32
   return vclzq_u32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
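+// vcnt_*: per-byte population count via llvm.ctpop.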
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcnt_s8
   return vcnt_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vcntq_s8
   return vcntq_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vcnt_u8
   return vcnt_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vcntq_u8
   return vcntq_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vcnt_p8
   return vcnt_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vcntq_p8
   return vcntq_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
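+// vmvn_*: bitwise NOT, emitted as an xor with an all-ones vector.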
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmvn_s8
   return vmvn_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_s8
   return vmvnq_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmvn_s16
   return vmvn_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_s16
   return vmvnq_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmvn_s32
   return vmvn_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_s32
   return vmvnq_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmvn_u8
   return vmvn_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_u8
   return vmvnq_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmvn_u16
   return vmvn_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_u16
   return vmvnq_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmvn_u32
   return vmvn_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_u32
   return vmvnq_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vmvn_p8
   return vmvn_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_p8
   return vmvnq_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
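+// vrbit_*: reverse the bit order within each byte (rbit).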
+// CHECK-LABEL: define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrbit_s8
   return vrbit_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_s8
   return vrbitq_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrbit_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vrbit_u8
   return vrbit_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_u8
   return vrbitq_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrbit_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vrbit_p8
   return vrbit_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_p8
   return vrbitq_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
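+// vmovn_*: narrowing move; a plain trunc of each lane to half its width.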
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmovn_s16
   return vmovn_s16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmovn_s32
   return vmovn_s32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vmovn_s64
   return vmovn_s64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmovn_u16
   return vmovn_u16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmovn_u32
   return vmovn_u32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vmovn_u64
   return vmovn_u64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
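+// vmovn_high_*: narrow the second operand and concatenate it onto the first
+// with a shufflevector, filling the high half of the 128-bit result.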
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_s16
   return vmovn_high_s16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_s32
   return vmovn_high_s32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_s64
   return vmovn_high_s64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
-int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
+uint8x16_t test_vmovn_high_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_u16
   return vmovn_high_u16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
-int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
+uint16x8_t test_vmovn_high_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_u32
   return vmovn_high_u32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
-int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
+uint32x4_t test_vmovn_high_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_u64
   return vmovn_high_u64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
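+
+// Note: the *_high forms append the freshly narrowed lanes to the low half
+// passed in %a; in IR that is the trunc feeding a full-width shufflevector
+// whose indices take every lane of both operands in order, i.e. roughly
+// result = concat(a, vmovn(b)) under that reading.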
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
-int8x8_t test_vqmovun_s16(int16x8_t a) {
+uint8x8_t test_vqmovun_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovun_s16
   return vqmovun_s16(a);
-  // CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
-int16x4_t test_vqmovun_s32(int32x4_t a) {
+uint16x4_t test_vqmovun_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovun_s32
   return vqmovun_s32(a);
-  // CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
-int32x2_t test_vqmovun_s64(int64x2_t a) {
+uint32x2_t test_vqmovun_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovun_s64
   return vqmovun_s64(a);
-  // CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
-int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) {
+uint8x16_t test_vqmovun_high_s16(uint8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s16
   return vqmovun_high_s16(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
-int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) {
+uint16x8_t test_vqmovun_high_s32(uint16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s32
   return vqmovun_high_s32(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
-int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) {
+uint32x4_t test_vqmovun_high_s64(uint32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s64
   return vqmovun_high_s64(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
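+
+// Note: signed-to-unsigned saturating narrows have no plain-IR spelling, so
+// vqmovun_* stays a call to llvm.aarch64.neon.sqxtun. A one-lane scalar
+// sketch of the intended semantics (illustrative only):
+// uint8_t r = s < 0 ? 0 : s > 255 ? 255 : (uint8_t)s;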
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_s16
   return vqmovn_s16(a);
-  // CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_s32
   return vqmovn_s32(a);
-  // CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_s64
   return vqmovn_s64(a);
-  // CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s16
   return vqmovn_high_s16(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s32
   return vqmovn_high_s32(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s64
   return vqmovn_high_s64(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
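+
+// Note: vqmovn_s* clamps to the signed range of the narrower type before
+// truncating, via llvm.aarch64.neon.sqxtn; e.g. for s16 -> s8 each lane
+// lands in [-128, 127]. Rough one-lane model (illustrative only):
+// int8_t r = s < -128 ? -128 : s > 127 ? 127 : (int8_t)s;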
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_u16
   return vqmovn_u16(a);
-  // CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_u32
   return vqmovn_u32(a);
-  // CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_u64
   return vqmovn_u64(a);
-  // CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u16
   return vqmovn_high_u16(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u32
   return vqmovn_high_u32(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u64
   return vqmovn_high_u64(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
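+
+// Note: the unsigned forms use llvm.aarch64.neon.uqxtn and clamp to the
+// unsigned range of the narrower type instead, e.g. u16 -> u8 saturates
+// at 255.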
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_s8
   return vshll_n_s8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_s16
   return vshll_n_s16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_s32
   return vshll_n_s32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_u8
   return vshll_n_u8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_u16
   return vshll_n_u16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_u32
   return vshll_n_u32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
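+
+// Note: these vshll_n tests only use the maximum shift (the source element
+// width), which selects the SHLL form; in IR that is a sign- or zero-extend
+// followed by a splatted constant shl. Rough one-lane model for
+// test_vshll_n_u8 (illustrative only): uint16_t r = (uint16_t)x << 8;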
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s8
   return vshll_high_n_s8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s16
   return vshll_high_n_s16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s32
   return vshll_high_n_s32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u8
   return vshll_high_n_u8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
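+
+// Note: the *_high variants first extract the upper half of the source with
+// a shufflevector (indices N/2..N-1) and then apply the same
+// extend-then-shl pattern to those lanes.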
 
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #2
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_f16_f32
   return vcvt_f16_f32(a);
-  // CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x half> @test_vcvt_high_f16_f32(<4 x half> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) #2
+// CHECK:   [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I_I]]
 float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) {
-  //CHECK-LABEL: test_vcvt_high_f16_f32
   return vcvt_high_f16_f32(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f64
   return vcvt_f32_f64(a);
-  // CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvt_high_f32_f64
   return vcvt_high_f32_f64(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) #2
+// CHECK:   ret <2 x float> [[VCVTX_F32_V1_I]]
 float32x2_t test_vcvtx_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtx_f32_f64
   return vcvtx_f32_f64(a);
-  // CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvtx_high_f32_f64
   return vcvtx_high_f32_f64(a, b);
-  // CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f16
   return vcvt_f32_f16(a);
-  // CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
-  //CHECK-LABEL: test_vcvt_high_f32_f16
   return vcvt_high_f32_f16(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvt_f64_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f64_f32
   return vcvt_f64_f32(a);
-  // CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I_I]]
 float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_high_f64_f32
   return vcvt_high_f64_f32(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
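+
+// Note: the f32 <-> f64 conversions are expressible as plain fptrunc/fpext,
+// while the f16 cases go through the vcvtfp2hf/vcvthf2fp intrinsics (half is
+// a storage-only format here), with the <4 x half> value recovered by
+// bitcasting the intrinsic's <4 x i16> payload. vcvtx additionally keeps the
+// fcvtxn intrinsic for its round-to-odd behaviour.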
 
+// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> [[VRNDN_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDN1_I]]
 float32x2_t test_vrndn_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndn_f32
   return vrndn_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> [[VRNDN_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDN1_I]]
 float32x4_t test_vrndnq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndnq_f32
   return vrndnq_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> [[VRNDN_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDN1_I]]
 float64x2_t test_vrndnq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndnq_f64
   return vrndnq_f64(a);
-  // CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDA1_I]]
 float32x2_t test_vrnda_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnda_f32
   return vrnda_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDA1_I]]
 float32x4_t test_vrndaq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndaq_f32
   return vrndaq_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDA1_I]]
 float64x2_t test_vrndaq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndaq_f64
   return vrndaq_f64(a);
-  // CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDP1_I]]
 float32x2_t test_vrndp_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndp_f32
   return vrndp_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDP1_I]]
 float32x4_t test_vrndpq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndpq_f32
   return vrndpq_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDP1_I]]
 float64x2_t test_vrndpq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndpq_f64
   return vrndpq_f64(a);
-  // CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDM1_I]]
 float32x2_t test_vrndm_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndm_f32
   return vrndm_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDM1_I]]
 float32x4_t test_vrndmq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndmq_f32
   return vrndmq_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDM1_I]]
 float64x2_t test_vrndmq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndmq_f64
   return vrndmq_f64(a);
-  // CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDX1_I]]
 float32x2_t test_vrndx_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndx_f32
   return vrndx_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDX1_I]]
 float32x4_t test_vrndxq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndxq_f32
   return vrndxq_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDX1_I]]
 float64x2_t test_vrndxq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndxq_f64
   return vrndxq_f64(a);
-  // CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDZ1_I]]
 float32x2_t test_vrnd_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnd_f32
   return vrnd_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDZ1_I]]
 float32x4_t test_vrndq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndq_f32
   return vrndq_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDZ1_I]]
 float64x2_t test_vrndq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndq_f64
   return vrndq_f64(a);
-  // CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDI1_I]]
 float32x2_t test_vrndi_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndi_f32
   return vrndi_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDI1_I]]
 float32x4_t test_vrndiq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndiq_f32
   return vrndiq_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDI_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDI1_I]]
 float64x2_t test_vrndiq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndiq_f64
   return vrndiq_f64(a);
-  // CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
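+
+// Note: most of the vrnd* family maps onto generic LLVM intrinsics:
+// vrnda -> llvm.round (ties away from zero), vrndp -> llvm.ceil,
+// vrndm -> llvm.floor, vrndx -> llvm.rint, vrnd -> llvm.trunc and
+// vrndi -> llvm.nearbyint; only vrndn (round to nearest, ties to even)
+// keeps a target intrinsic, llvm.aarch64.neon.frintn.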
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_s32_f32
   return vcvt_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_s32_f32
   return vcvtq_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_s64_f64
   return vcvtq_s64_f64(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_u32_f32
   return vcvt_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_u32_f32
   return vcvtq_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_u64_f64
   return vcvtq_u64_f64(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
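+
+// Note: the default FP-to-integer conversion rounds toward zero, which plain
+// IR can express directly, so these tests expect fptosi/fptoui rather than
+// an intrinsic call.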
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_s32_f32
   return vcvtn_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_s32_f32
   return vcvtnq_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 int64x2_t test_vcvtnq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_s64_f64
   return vcvtnq_s64_f64(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_u32_f32
   return vcvtn_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_u32_f32
   return vcvtnq_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_u64_f64
   return vcvtnq_u64_f64(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_s32_f32
   return vcvtp_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_s32_f32
   return vcvtpq_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 int64x2_t test_vcvtpq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_s64_f64
   return vcvtpq_s64_f64(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_u32_f32
   return vcvtp_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_u32_f32
   return vcvtpq_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_u64_f64
   return vcvtpq_u64_f64(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_s32_f32
   return vcvtm_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_s32_f32
   return vcvtmq_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_s64_f64
   return vcvtmq_s64_f64(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_u32_f32
   return vcvtm_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_u32_f32
   return vcvtmq_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_u64_f64
   return vcvtmq_u64_f64(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_s32_f32
   return vcvta_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_s32_f32
   return vcvtaq_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_s64_f64
   return vcvtaq_s64_f64(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_u32_f32
   return vcvta_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_u32_f32
   return vcvtaq_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_u64_f64
   return vcvtaq_u64_f64(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
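+
+// Note: conversions with an explicit rounding mode cannot be written in
+// plain IR, so each test checks the matching intrinsic: fcvtn* (to nearest,
+// ties to even), fcvtp* (toward +inf), fcvtm* (toward -inf) and fcvta* (to
+// nearest, ties away from zero), each in signed (s) and unsigned (u) forms.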
 
+// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #2
+// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrsqrte_f32
   return vrsqrte_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #2
+// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f32
   return vrsqrteq_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]]) #2
+// CHECK:   ret <2 x double> [[VRSQRTEQ_V1_I]]
 float64x2_t test_vrsqrteq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f64
   return vrsqrteq_f64(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #2
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_f32
   return vrecpe_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_f32
   return vrecpeq_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <2 x double> [[VRECPEQ_V1_I]]
 float64x2_t test_vrecpeq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrecpeq_f64
   return vrecpeq_f64(a);
-  // CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_u32
   return vrecpe_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_u32
   return vrecpeq_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
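
Note: frsqrte/frecpe (and urecpe) only produce a low-precision estimate,
roughly 8 bits; real uses refine it with the matching step intrinsics.
A sketch of the usual Newton-Raphson refinement, assuming the standard
vrecps_f32 step (vrecps computes 2 - a*b):

  #include <arm_neon.h>

  /* Sketch: x(n+1) = x(n) * (2 - d * x(n)); two refinement steps on
   * top of the vrecpe estimate checked above. */
  static float32x2_t recip_f32(float32x2_t d) {
    float32x2_t x = vrecpe_f32(d);        /* initial estimate */
    x = vmul_f32(x, vrecps_f32(d, x));    /* first step */
    x = vmul_f32(x, vrecps_f32(d, x));    /* second step */
    return x;
  }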
 
+// CHECK-LABEL: define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP1]]) #2
+// CHECK:   ret <2 x float> [[VSQRT_I]]
 float32x2_t test_vsqrt_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vsqrt_f32
   return vsqrt_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret <4 x float> [[VSQRT_I]]
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vsqrtq_f32
   return vsqrtq_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) #2
+// CHECK:   ret <2 x double> [[VSQRT_I]]
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vsqrtq_f64
   return vsqrtq_f64(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
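
Note: unlike the estimate intrinsics above, vsqrt maps onto the generic
@llvm.sqrt.* intrinsic rather than an aarch64-specific one; fsqrt is a
correctly rounded IEEE square root, so the generic semantics fit. Usage
sketch only:

  float32x2_t v = {4.0f, 9.0f};
  float32x2_t r = vsqrt_f32(v);   /* exactly {2.0f, 3.0f} */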
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_s32
   return vcvt_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_u32
   return vcvt_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_s32
   return vcvtq_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_u32
   return vcvtq_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_s64
   return vcvtq_f64_s64(a);
-  //CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_u64
   return vcvtq_f64_u64(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
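
Note: the int-to-float conversions now check plain sitofp/uitofp IR
instead of scvtf/ucvtf assembly. The two differ only in how the top bit
of the source is interpreted; a quick standalone illustration:

  #include <arm_neon.h>

  /* Illustration only: the same bit pattern converts differently
   * through the signed and unsigned intrinsics. */
  void cvt_demo(void) {
    int32x2_t  s = {-1, 0};
    uint32x2_t u = {0xFFFFFFFFu, 0};
    float32x2_t fs = vcvt_f32_s32(s);  /* {-1.0f, 0.0f} */
    float32x2_t fu = vcvt_f32_u32(u);  /* {4294967296.0f, 0.0f} */
    (void)fs; (void)fu;
  }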

Modified: cfe/trunk/test/CodeGen/aarch64-neon-perm.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-perm.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-perm.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-perm.c Wed Mar  9 12:54:42 2016
@@ -1,1092 +1,2279 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
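
Note: with the RUN line above, the IR is generated unoptimized and only
run through mem2reg, so the checks below are not buried in allocas. The
[[SHUFFLE_I:%.*]] syntax is a FileCheck variable: the first use binds it
to whatever SSA name clang picked, and later [[SHUFFLE_I]] uses must
match the same string. So a pair such as

  // CHECK:   [[SHUFFLE_I:%.*]] = shufflevector ...
  // CHECK:   ret <8 x i8> [[SHUFFLE_I]]

requires the function to return exactly the shufflevector result,
independent of value naming.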
 
 // Test new aarch64 intrinsics and types
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_s8
   return vuzp1_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_s8
   return vuzp1q_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_s16
   return vuzp1_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_s16
   return vuzp1q_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_s32
   return vuzp1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_s32
   return vuzp1q_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_s64
   return vuzp1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_u8
   return vuzp1_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_u8
   return vuzp1q_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_u16
   return vuzp1_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_u16
   return vuzp1q_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_u32
   return vuzp1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_u32
   return vuzp1q_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_u64
   return vuzp1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_f32
   return vuzp1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_f32
   return vuzp1q_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_f64
   return vuzp1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_p8
   return vuzp1_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_p8
   return vuzp1q_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_p16
   return vuzp1_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_p16
   return vuzp1q_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_s8
   return vuzp2_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_s8
   return vuzp2q_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_s16
   return vuzp2_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_s16
   return vuzp2q_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_s32
   return vuzp2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_s32
   return vuzp2q_s32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_s64
   return vuzp2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_u8
   return vuzp2_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_u8
   return vuzp2q_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_u16
   return vuzp2_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_u16
   return vuzp2q_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_u32
   return vuzp2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_u32
   return vuzp2q_u32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_u64
   return vuzp2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_f32
   return vuzp2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_f32
   return vuzp2q_f32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_f64
   return vuzp2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_p8
   return vuzp2_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_p8
   return vuzp2q_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_p16
   return vuzp2_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_p16
   return vuzp2q_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
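
Note: at the IR level uzp1/uzp2 are pure shufflevectors over the
concatenation of the two operands -- uzp1 keeps the even-numbered
elements (<0, 2, 4, ...>) and uzp2 the odd-numbered ones (<1, 3, ...>).
For 2-element vectors the uzp1 mask <0, 2> coincides with zip1's, which
is why the old assembly checks for the .2s/.2d cases had to accept
zip1/ins alternatives; the IR checks have no such ambiguity. Usage
sketch (illustration only), deinterleaving an {x0,y0,x1,y1,...} stream:

  #include <arm_neon.h>

  /* Illustration only: split interleaved bytes into even-index (x)
   * and odd-index (y) lanes. */
  void split_xy(int8x8_t lo, int8x8_t hi, int8x8_t *xs, int8x8_t *ys) {
    *xs = vuzp1_s8(lo, hi);  /* elements 0,2,4,... of concat(lo,hi) */
    *ys = vuzp2_s8(lo, hi);  /* elements 1,3,5,... of concat(lo,hi) */
  }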
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip1_s8
   return vzip1_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_s8
   return vzip1q_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip1_s16
   return vzip1_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_s16
   return vzip1q_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip1_s32
   return vzip1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_s32
   return vzip1q_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_s64
   return vzip1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip1_u8
   return vzip1_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_u8
   return vzip1q_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip1_u16
   return vzip1_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_u16
   return vzip1q_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip1_u32
   return vzip1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_u32
   return vzip1q_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_u64
   return vzip1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip1_f32
   return vzip1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_f32
   return vzip1q_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_f64
   return vzip1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip1_p8
   return vzip1_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_p8
   return vzip1q_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip1_p16
   return vzip1_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_p16
   return vzip1q_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip2_s8
   return vzip2_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_s8
   return vzip2q_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip2_s16
   return vzip2_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_s16
   return vzip2q_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip2_s32
   return vzip2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_s32
   return vzip2q_s32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_s64
   return vzip2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip2_u8
   return vzip2_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_u8
   return vzip2q_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip2_u16
   return vzip2_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_u16
   return vzip2q_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip2_u32
   return vzip2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_u32
   return vzip2q_u32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_u64
   return vzip2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip2_f32
   return vzip2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_f32
   return vzip2q_f32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_f64
   return vzip2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip2_p8
   return vzip2_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_p8
   return vzip2q_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip2_p16
   return vzip2_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_p16
   return vzip2q_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
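
Note: zip1 interleaves the low halves of the two operands and zip2 the
high halves (for 8-element vectors, masks <0, 8, 1, 9, ...> and
<4, 12, 5, 13, ...>). For 2-element types each half is a single element,
so zip1/zip2 degenerate to <0, 2>/<1, 3>, matching the uzp masks above.
A sketch of the inverse of the split_xy example (illustration only):

  #include <arm_neon.h>

  /* Illustration only: re-interleave x and y lanes. */
  void merge_xy(int8x8_t xs, int8x8_t ys, int8x8_t *lo, int8x8_t *hi) {
    *lo = vzip1_s8(xs, ys);  /* x0,y0,x1,y1,x2,y2,x3,y3 */
    *hi = vzip2_s8(xs, ys);  /* x4,y4,x5,y5,x6,y6,x7,y7 */
  }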
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_s8
   return vtrn1_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_s8
   return vtrn1q_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_s16
   return vtrn1_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_s16
   return vtrn1q_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_s32
   return vtrn1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_s32
   return vtrn1q_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_s64
   return vtrn1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_u8
   return vtrn1_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_u8
   return vtrn1q_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_u16
   return vtrn1_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_u16
   return vtrn1q_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_u32
   return vtrn1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_u32
   return vtrn1q_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_u64
   return vtrn1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_f32
   return vtrn1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_f32
   return vtrn1q_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_f64
   return vtrn1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_p8
   return vtrn1_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_p8
   return vtrn1q_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_p16
   return vtrn1_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_p16
   return vtrn1q_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
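
Note: trn1/trn2 take alternating elements from each operand -- trn1
yields a[0], b[0], a[2], b[2], ... and trn2 yields a[1], b[1], a[3],
b[3], ... Applied to vector rows they transpose 2x2 blocks, and the
2-element cases again collapse to the <0, 2>/<1, 3> masks. Illustration
only, transposing a 2x2 float matrix stored as two rows:

  #include <arm_neon.h>

  /* Illustration only: columns of a 2x2 matrix held as two rows. */
  void transpose2x2(float32x2_t row0, float32x2_t row1,
                    float32x2_t *col0, float32x2_t *col1) {
    *col0 = vtrn1_f32(row0, row1);  /* {row0[0], row1[0]} */
    *col1 = vtrn2_f32(row0, row1);  /* {row0[1], row1[1]} */
  }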
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_s8
   return vtrn2_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_s8
   return vtrn2q_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_s16
   return vtrn2_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_s16
   return vtrn2q_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_s32
   return vtrn2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_s32
   return vtrn2q_s32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_s64
   return vtrn2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_u8
   return vtrn2_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_u8
   return vtrn2q_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_u16
   return vtrn2_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_u16
   return vtrn2q_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_u32
   return vtrn2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_u32
   return vtrn2q_u32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_u64
   return vtrn2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_f32
   return vtrn2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_f32
   return vtrn2q_f32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_f64
   return vtrn2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_p8
   return vtrn2_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_p8
   return vtrn2q_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_p16
   return vtrn2_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_p16
   return vtrn2q_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
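+// The vuzp/vzip intrinsics below return the x2 aggregate types, so each test
+// stores the two shuffle results into consecutive vector slots of a struct
+// and copies it out with llvm.memcpy; those allocas survive mem2reg because
+// their address escapes into the bitcasts and the memcpy. A usage sketch
+// (use_vuzp is an illustrative name, not part of this test):
+//
+//   int8x8_t use_vuzp(int8x8_t a, int8x8_t b) {
+//     int8x8x2_t r = vuzp_s8(a, b); // r.val[0] = uzp1, r.val[1] = uzp2
+//     return vadd_s8(r.val[0], r.val[1]);
+//   }
+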
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp_s8
   return vuzp_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
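+// vuzp de-interleaves: the first half of the result takes the even-indexed
+// elements of the concatenation (a, b) and the second half the odd-indexed
+// ones, matching the <0, 2, ...> and <1, 3, ...> masks above. A scalar sketch
+// assuming <stdint.h> (scalar_uzp_s8 is an illustrative name, not an
+// arm_neon.h intrinsic):
+//
+//   void scalar_uzp_s8(int8_t even[8], int8_t odd[8],
+//                      const int8_t a[8], const int8_t b[8]) {
+//     const int8_t *src[2] = { a, b };
+//     for (int i = 0; i < 8; ++i) {
+//       even[i] = src[i / 4][(2 * i) % 8];     // lanes 0,2,...,14 of (a,b)
+//       odd[i]  = src[i / 4][(2 * i + 1) % 8]; // lanes 1,3,...,15 of (a,b)
+//     }
+//   }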
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp_s16
   return vuzp_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp_s32
   return vuzp_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp_u8
   return vuzp_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp_u16
   return vuzp_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp_u32
   return vuzp_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp_f32
   return vuzp_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp_p8
   return vuzp_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp_p16
   return vuzp_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_s8
   return vuzpq_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_s16
   return vuzpq_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_s32
   return vuzpq_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_u8
   return vuzpq_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_u16
   return vuzpq_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_u32
   return vuzpq_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_f32
   return vuzpq_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_p8
   return vuzpq_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_p16
   return vuzpq_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
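+// vzip interleaves the low halves of the two inputs into the first result and
+// the high halves into the second, matching the <0, 8, 1, 9, ...> and
+// <4, 12, 5, 13, ...> masks below. A scalar sketch assuming <stdint.h>
+// (scalar_zip_s8 is an illustrative name, not an arm_neon.h intrinsic):
+//
+//   void scalar_zip_s8(int8_t lo[8], int8_t hi[8],
+//                      const int8_t a[8], const int8_t b[8]) {
+//     for (int i = 0; i < 4; ++i) {
+//       lo[2 * i] = a[i];     lo[2 * i + 1] = b[i];     // low-half lanes
+//       hi[2 * i] = a[i + 4]; hi[2 * i + 1] = b[i + 4]; // high-half lanes
+//     }
+//   }
+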
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip_s8
   return vzip_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip_s16
   return vzip_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip_s32
   return vzip_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip_u8
   return vzip_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip_u16
   return vzip_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip_u32
   return vzip_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip_f32
   return vzip_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
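+
+// Illustrative sketch: for two-lane 64-bit vectors, zip, uzp and trn all
+// collapse to the same masks <0, 2> and <1, 3>, which is why the retired
+// assembly checks accepted either ins or zip1/zip2 here. Assuming
+// a = {0, 1} and b = {2, 3}:
+//
+//   uint32x2x2_t r = vzip_u32(a, b);
+//   // r.val[0] == {0, 2}, r.val[1] == {1, 3}
+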
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip_p8
   return vzip_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip_p16
   return vzip_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzipq_s8
   return vzipq_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
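+
+// Illustrative sketch: the q-register variants interleave 128-bit vectors,
+// so the masks double in length, the struct alignment grows to 16 and the
+// struct copy to 32 bytes; the lane pattern is otherwise unchanged:
+//
+//   int8x16x2_t r = vzipq_s8(a, b);
+//   // r.val[0] == {a0,b0, a1,b1, ..., a7,b7}    <- mask <0, 16, 1, 17, ...>
+//   // r.val[1] == {a8,b8, a9,b9, ..., a15,b15}  <- mask <8, 24, 9, 25, ...>
+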
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzipq_s16
   return vzipq_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzipq_s32
   return vzipq_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzipq_u8
   return vzipq_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzipq_u16
   return vzipq_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzipq_u32
   return vzipq_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzipq_f32
   return vzipq_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzipq_p8
   return vzipq_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzipq_p16
   return vzipq_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
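+// Illustrative sketch: vtrn transposes lane pairs rather than interleaving
+// halves: trn1 keeps the even-indexed lanes of both inputs and trn2 the
+// odd-indexed ones, hence the masks <0, 8, 2, 10, ...> and <1, 9, 3, 11, ...>
+// below. Assuming a = {0, ..., 7} and b = {8, ..., 15}:
+//
+//   int8x8x2_t r = vtrn_s8(a, b);
+//   // r.val[0] == {0, 8, 2, 10, 4, 12, 6, 14}
+//   // r.val[1] == {1, 9, 3, 11, 5, 13, 7, 15}
+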
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn_s8
   return vtrn_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn_s16
   return vtrn_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn_s32
   return vtrn_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn_u8
   return vtrn_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn_u16
   return vtrn_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn_u32
   return vtrn_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn_f32
   return vtrn_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
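+
+// Illustrative sketch: as with vzip above, the two-lane vtrn_s32, vtrn_u32
+// and vtrn_f32 cases collapse to masks <0, 2> and <1, 3>, so the retired
+// assembly checks had to accept either ins or zip1/zip2 for them as well:
+//
+//   float32x2x2_t r = vtrn_f32(a, b);
+//   // r.val[0] == {a0, b0}, r.val[1] == {a1, b1}
+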
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn_p8
   return vtrn_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn_p16
   return vtrn_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_s8
   return vtrnq_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
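+
+// Illustrative sketch: vtrnq applies the same even/odd transpose to 16-lane
+// q registers. Assuming a = {0, ..., 15} and b = {16, ..., 31}:
+//
+//   int8x16x2_t r = vtrnq_s8(a, b);
+//   // r.val[0] == {0,16, 2,18, 4,20, 6,22, 8,24, 10,26, 12,28, 14,30}
+//   // r.val[1] == {1,17, 3,19, 5,21, 7,23, 9,25, 11,27, 13,29, 15,31}
+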
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_s16
   return vtrnq_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_s32
   return vtrnq_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_u8
   return vtrnq_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_u16
   return vtrnq_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_u32
   return vtrnq_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_f32
   return vtrnq_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_p8
   return vtrnq_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_p16
   return vtrnq_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
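
As an aside on what the masks above encode: the two shufflevector masks per
function are the trn1/trn2 interleavings, taking even and odd lanes
alternately from the two inputs. A minimal standalone sketch of the same
semantics (illustrative only; the helper name check_vtrnq_u16 is assumed,
not part of the test file):

#include <arm_neon.h>
#include <assert.h>

// val[0] follows the <0,8,2,10,4,12,6,14> mask (even lanes of a and b);
// val[1] follows the <1,9,3,11,5,13,7,15> mask (odd lanes of a and b).
static void check_vtrnq_u16(void) {
  const uint16_t lhs[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  const uint16_t rhs[8] = {8, 9, 10, 11, 12, 13, 14, 15};
  uint16x8x2_t r = vtrnq_u16(vld1q_u16(lhs), vld1q_u16(rhs));
  assert(vgetq_lane_u16(r.val[0], 0) == 0);  // lane 0 of lhs
  assert(vgetq_lane_u16(r.val[0], 1) == 8);  // lane 0 of rhs
  assert(vgetq_lane_u16(r.val[1], 0) == 1);  // lane 1 of lhs
  assert(vgetq_lane_u16(r.val[1], 1) == 9);  // lane 1 of rhs
}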

Modified: cfe/trunk/test/CodeGen/aarch64-neon-scalar-copy.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/aarch64-neon-scalar-copy.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/aarch64-neon-scalar-copy.c (original)
+++ cfe/trunk/test/CodeGen/aarch64-neon-scalar-copy.c Wed Mar  9 12:54:42 2016
@@ -1,173 +1,228 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vdups_lane_f32
+// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VDUPS_LANE]]
 float32_t test_vdups_lane_f32(float32x2_t a) {
   return vdups_lane_f32(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_f64
+// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   ret double [[VDUPD_LANE]]
 float64_t test_vdupd_lane_f64(float64x1_t a) {
   return vdupd_lane_f64(a, 0);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_f32
+// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vdups_laneq_f32(float32x4_t a) {
   return vdups_laneq_f32(a, 3);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_f64
+// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   ret double [[VGETQ_LANE]]
 float64_t test_vdupd_laneq_f64(float64x2_t a) {
   return vdupd_laneq_f64(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_s8
+// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vdupb_lane_s8(int8x8_t a) {
   return vdupb_lane_s8(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_s16
+// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vduph_lane_s16(int16x4_t a) {
   return vduph_lane_s16(a, 3);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_s32
+// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vdups_lane_s32(int32x2_t a) {
   return vdups_lane_s32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_s64
+// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vdupd_lane_s64(int64x1_t a) {
   return vdupd_lane_s64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_u8
+// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vdupb_lane_u8(uint8x8_t a) {
   return vdupb_lane_u8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_u16
+// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vduph_lane_u16(uint16x4_t a) {
   return vduph_lane_u16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_u32
+// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vdups_lane_u32(uint32x2_t a) {
   return vdups_lane_u32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_u64
+// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vdupd_lane_u64(uint64x1_t a) {
   return vdupd_lane_u64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
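
A hedged usage sketch of the _lane forms checked above (the helper name
check_vdups_lane_u32 is assumed, not part of the patch): on a 64-bit vector
these intrinsics are plain lane reads, i.e. the single extractelement the
new CHECK lines expect rather than a dup instruction.

#include <arm_neon.h>
#include <assert.h>

// Reading lane 1 of a <2 x i32> vector; equivalent to vget_lane_u32.
static void check_vdups_lane_u32(void) {
  const uint32_t src[2] = {100, 200};
  uint32x2_t v = vld1_u32(src);
  assert(vdups_lane_u32(v, 1) == 200);
  assert(vdups_lane_u32(v, 1) == vget_lane_u32(v, 1));
}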
 
-// CHECK-LABEL: test_vdupb_laneq_s8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vdupb_laneq_s8(int8x16_t a) {
   return vdupb_laneq_s8(a, 15);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_s16
+// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vduph_laneq_s16(int16x8_t a) {
   return vduph_laneq_s16(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_s32
+// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vdups_laneq_s32(int32x4_t a) {
   return vdups_laneq_s32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_s64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vdupd_laneq_s64(int64x2_t a) {
   return vdupd_laneq_s64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_laneq_u8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vdupb_laneq_u8(uint8x16_t a) {
   return vdupb_laneq_u8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_u16
+// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vduph_laneq_u16(uint16x8_t a) {
   return vduph_laneq_u16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_u32
+// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vdups_laneq_u32(uint32x4_t a) {
   return vdups_laneq_u32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_u64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vdupd_laneq_u64(uint64x2_t a) {
   return vdupd_laneq_u64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vdupb_lane_p8
+// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vdupb_lane_p8(poly8x8_t a) {
   return vdupb_lane_p8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
-// CHECK-LABEL: test_vduph_lane_p16
+// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vduph_lane_p16(poly16x4_t a) {
   return vduph_lane_p16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vdupb_laneq_p8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vdupb_laneq_p8(poly8x16_t a) {
   return vdupb_laneq_p8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
-// CHECK-LABEL: test_vduph_laneq_p16
+// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vduph_laneq_p16(poly16x8_t a) {
   return vduph_laneq_p16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
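
The _laneq forms behave the same way on 128-bit vectors, including the poly
types at the end of the file. One more illustrative check (helper name
check_vduph_laneq_p16 assumed, not part of the test file):

#include <arm_neon.h>
#include <assert.h>

// Reading lane 7 of an <8 x i16> poly vector, matching the
// extractelement at index 7 in the CHECK lines above.
static void check_vduph_laneq_p16(void) {
  const poly16_t src[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  poly16x8_t v = vld1q_p16(src);
  assert(vduph_laneq_p16(v, 7) == 7);
}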
 

More information about the cfe-commits mailing list