r263048 - ARM & AArch64: convert asm tests to LLVM IR and restrict optimizations.

Wed Mar 9 10:54:43 PST 2016

Modified: cfe/trunk/test/CodeGen/arm_neon_intrinsics.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/arm_neon_intrinsics.c?rev=263048&r1=263047&r2=263048&view=diff
==============================================================================

--- cfe/trunk/test/CodeGen/arm_neon_intrinsics.c (original)
+++ cfe/trunk/test/CodeGen/arm_neon_intrinsics.c Wed Mar  9 12:54:42 2016
@@ -1,1611 +1,2446 @@
 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
-// RUN: %clang_cc1 -triple armv8-linux-gnu \
-// RUN:  -target-cpu cortex-a57 -mfloat-abi soft -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-A57
+// RUN:  -target-cpu swift -ffreestanding -emit-llvm -S -o - %s\
+// RUN:  opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: long_tests
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vaba_s8
-// CHECK: vaba.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vaba_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s16
-// CHECK: vaba.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vaba_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s32
-// CHECK: vaba.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vaba_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u8
-// CHECK: vaba.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vaba_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u16
-// CHECK: vaba.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vaba_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u32
-// CHECK: vaba.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vaba_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s8
-// CHECK: vaba.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vabaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s16
-// CHECK: vaba.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vabaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s32
-// CHECK: vaba.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vabaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u8
-// CHECK: vaba.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vabaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u16
-// CHECK: vaba.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vabaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u32
-// CHECK: vaba.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vabaq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vabal_s8
-// CHECK: vabal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vabal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s16
-// CHECK: vabal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vabal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s32
-// CHECK: vabal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vabal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u8
-// CHECK: vabal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vabal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u16
-// CHECK: vabal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vabal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u32
-// CHECK: vabal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vabal_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vabd_s8
-// CHECK: vabd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
   return vabd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s16
-// CHECK: vabd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
   return vabd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s32
-// CHECK: vabd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
   return vabd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u8
-// CHECK: vabd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
   return vabd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u16
-// CHECK: vabd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
   return vabd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u32
-// CHECK: vabd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
   return vabd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_f32
-// CHECK: vabd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4
+// CHECK:   ret <2 x float> [[VABD2_I]]
 float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
   return vabd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s8
-// CHECK: vabd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
   return vabdq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s16
-// CHECK: vabd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
   return vabdq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s32
-// CHECK: vabd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
   return vabdq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u8
-// CHECK: vabd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
   return vabdq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u16
-// CHECK: vabd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
   return vabdq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u32
-// CHECK: vabd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
   return vabdq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_f32
-// CHECK: vabd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4
+// CHECK:   ret <4 x float> [[VABD2_I]]
 float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
   return vabdq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vabdl_s8
-// CHECK: vabdl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
   return vabdl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s16
-// CHECK: vabdl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
   return vabdl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s32
-// CHECK: vabdl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
   return vabdl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u8
-// CHECK: vabdl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
   return vabdl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u16
-// CHECK: vabdl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
   return vabdl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u32
-// CHECK: vabdl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
   return vabdl_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vabs_s8
-// CHECK: vabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
   return vabs_s8(a);
 }
 
-// CHECK-LABEL: test_vabs_s16
-// CHECK: vabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #4
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
   return vabs_s16(a);
 }
 
-// CHECK-LABEL: test_vabs_s32
-// CHECK: vabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #4
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
   return vabs_s32(a);
 }
 
-// CHECK-LABEL: test_vabs_f32
-// CHECK: vabs.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
   return vabs_f32(a);
 }
 
-// CHECK-LABEL: test_vabsq_s8
-// CHECK: vabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
   return vabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vabsq_s16
-// CHECK: vabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #4
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
   return vabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vabsq_s32
-// CHECK: vabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #4
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
   return vabsq_s32(a);
 }
 
-// CHECK-LABEL: test_vabsq_f32
-// CHECK: vabs.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
   return vabsq_f32(a);
 }
 
 
-// CHECK-LABEL: test_vadd_s8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
   return vadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
   return vadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
   return vadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
   return vadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vadd_f32
-// CHECK: vadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
   return vadd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
   return vadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
   return vadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
   return vadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
   return vadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
   return vaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
   return vaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
   return vaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
   return vaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_f32
-// CHECK: vadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
   return vaddq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vaddq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddhn_s16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
   return vaddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
   return vaddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
   return vaddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vaddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vaddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vaddhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddl_s8
-// CHECK: vaddl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
   return vaddl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s16
-// CHECK: vaddl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
   return vaddl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s32
-// CHECK: vaddl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
   return vaddl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u8
-// CHECK: vaddl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
   return vaddl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u16
-// CHECK: vaddl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
   return vaddl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u32
-// CHECK: vaddl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
   return vaddl_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddw_s8
-// CHECK: vaddw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
   return vaddw_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s16
-// CHECK: vaddw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
   return vaddw_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s32
-// CHECK: vaddw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
   return vaddw_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u8
-// CHECK: vaddw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
   return vaddw_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u16
-// CHECK: vaddw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
   return vaddw_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u32
-// CHECK: vaddw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
   return vaddw_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vand_s8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
   return vand_s8(a, b);
 }
 
-// CHECK-LABEL: test_vand_s16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
   return vand_s16(a, b);
 }
 
-// CHECK-LABEL: test_vand_s32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
   return vand_s32(a, b);
 }
 
-// CHECK-LABEL: test_vand_s64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
   return vand_s64(a, b);
 }
 
-// CHECK-LABEL: test_vand_u8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
   return vand_u8(a, b);
 }
 
-// CHECK-LABEL: test_vand_u16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
   return vand_u16(a, b);
 }
 
-// CHECK-LABEL: test_vand_u32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
   return vand_u32(a, b);
 }
 
-// CHECK-LABEL: test_vand_u64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
   return vand_u64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
   return vandq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
   return vandq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
   return vandq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
   return vandq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
   return vandq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
   return vandq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
   return vandq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
   return vandq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vbic_s8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
   return vbic_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
   return vbic_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
   return vbic_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
   return vbic_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
   return vbic_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
   return vbic_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
   return vbic_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
   return vbic_u64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
   return vbicq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
   return vbicq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
   return vbicq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
   return vbicq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
   return vbicq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
   return vbicq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
   return vbicq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
   return vbicq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vbsl_s8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
   return vbsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
   return vbsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
   return vbsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
   return vbsl_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vbsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vbsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vbsl_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
   return vbsl_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_f32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP4]]
 float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
   return vbsl_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
   return vbsl_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
   return vbsl_p16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
   return vbslq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
   return vbslq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
   return vbslq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
   return vbslq_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vbslq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vbslq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vbslq_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
   return vbslq_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_f32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP4]]
 float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
   return vbslq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %c
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
   return vbslq_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
   return vbslq_p16(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vcage_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
 uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
   return vcage_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcageq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
 uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
   return vcageq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcagt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
 uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
   return vcagt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcagtq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
 uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
   return vcagtq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcale_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
 uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
   return vcale_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaleq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
 uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
   return vcaleq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcalt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
 uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
   return vcalt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaltq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
 uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
   return vcaltq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vceq_s8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
   return vceq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
   return vceq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
   return vceq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_f32
-// CHECK: vceq.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
   return vceq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
   return vceq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
   return vceq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
   return vceq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_p8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
   return vceq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
   return vceqq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
   return vceqq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
   return vceqq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_f32
-// CHECK: vceq.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
   return vceqq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
   return vceqq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
   return vceqq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
   return vceqq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_p8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
   return vceqq_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vcge_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
   return vcge_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
   return vcge_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
   return vcge_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
   return vcge_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
   return vcge_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
   return vcge_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
   return vcge_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
   return vcgeq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
   return vcgeq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
   return vcgeq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
   return vcgeq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgeq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgeq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgeq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcgt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
   return vcgt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
   return vcgt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
   return vcgt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
   return vcgt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
   return vcgt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
   return vcgt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
   return vcgt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
   return vcgtq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
   return vcgtq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
   return vcgtq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
   return vcgtq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgtq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgtq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgtq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcle_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
   return vcle_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
   return vcle_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
   return vcle_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
   return vcle_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
   return vcle_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
   return vcle_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
   return vcle_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
   return vcleq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
   return vcleq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
   return vcleq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
   return vcleq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
   return vcleq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
   return vcleq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
   return vcleq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcls_s8
-// CHECK: vcls.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
   return vcls_s8(a);
 }
 
-// CHECK-LABEL: test_vcls_s16
-// CHECK: vcls.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
   return vcls_s16(a);
 }
 
-// CHECK-LABEL: test_vcls_s32
-// CHECK: vcls.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
   return vcls_s32(a);
 }
 
-// CHECK-LABEL: test_vclsq_s8
-// CHECK: vcls.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
   return vclsq_s8(a);
 }
 
-// CHECK-LABEL: test_vclsq_s16
-// CHECK: vcls.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
   return vclsq_s16(a);
 }
 
-// CHECK-LABEL: test_vclsq_s32
-// CHECK: vcls.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
   return vclsq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vclt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
   return vclt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
   return vclt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
   return vclt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
   return vclt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
   return vclt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
   return vclt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
   return vclt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
   return vcltq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
   return vcltq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
   return vcltq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
   return vcltq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
   return vcltq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
   return vcltq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
   return vcltq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vclz_s8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
   return vclz_s8(a);
 }
 
-// CHECK-LABEL: test_vclz_s16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
   return vclz_s16(a);
 }
 
-// CHECK-LABEL: test_vclz_s32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
   return vclz_s32(a);
 }
 
-// CHECK-LABEL: test_vclz_u8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
   return vclz_u8(a);
 }
 
-// CHECK-LABEL: test_vclz_u16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
   return vclz_u16(a);
 }
 
-// CHECK-LABEL: test_vclz_u32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
   return vclz_u32(a);
 }
 
-// CHECK-LABEL: test_vclzq_s8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
   return vclzq_s8(a);
 }
 
-// CHECK-LABEL: test_vclzq_s16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
   return vclzq_s16(a);
 }
 
-// CHECK-LABEL: test_vclzq_s32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
   return vclzq_s32(a);
 }
 
-// CHECK-LABEL: test_vclzq_u8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
   return vclzq_u8(a);
 }
 
-// CHECK-LABEL: test_vclzq_u16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
   return vclzq_u16(a);
 }
 
-// CHECK-LABEL: test_vclzq_u32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
   return vclzq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vcnt_u8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
   return vcnt_u8(a);
 }
 
-// CHECK-LABEL: test_vcnt_s8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
   return vcnt_s8(a);
 }
 
-// CHECK-LABEL: test_vcnt_p8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
   return vcnt_p8(a);
 }
 
-// CHECK-LABEL: test_vcntq_u8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
   return vcntq_u8(a);
 }
 
-// CHECK-LABEL: test_vcntq_s8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
   return vcntq_s8(a);
 }
 
-// CHECK-LABEL: test_vcntq_p8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
   return vcntq_p8(a);
 }
 
 
-// CHECK-LABEL: test_vcombine_s8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
   return vcombine_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
   return vcombine_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
   return vcombine_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
   return vcombine_s64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I]]
 float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
   return vcombine_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
   return vcombine_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
   return vcombine_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
   return vcombine_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
   return vcombine_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
   return vcombine_u64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
   return vcombine_p8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
   return vcombine_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vcreate_s8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vcreate_s8(uint64_t a) {
   return vclz_s8(vcreate_s8(a));
 }
 
-// CHECK-LABEL: test_vcreate_s16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vcreate_s16(uint64_t a) {
   return vclz_s16(vcreate_s16(a));
 }
 
-// CHECK-LABEL: test_vcreate_s32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vcreate_s32(uint64_t a) {
   return vclz_s32(vcreate_s32(a));
 }
 
-// CHECK-LABEL: test_vcreate_f16
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vcreate_f16(uint64_t a) {
   return vcreate_f16(a);
 }
 
-// CHECK-LABEL: test_vcreate_f32
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vcreate_f32(uint64_t a) {
   return vcreate_f32(a);
 }
 
-// CHECK-LABEL: test_vcreate_u8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vcreate_u8(uint64_t a) {
   return vclz_s8(vcreate_u8(a));
 }
 
-// CHECK-LABEL: test_vcreate_u16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vcreate_u16(uint64_t a) {
   return vclz_s16(vcreate_u16(a));
 }
 
-// CHECK-LABEL: test_vcreate_u32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vcreate_u32(uint64_t a) {
   return vclz_s32(vcreate_u32(a));
 }
@@ -1614,10145 +2449,21932 @@ uint32x2_t test_vcreate_u32(uint64_t a)
 // We have two ways of lowering that.  Either with one 'vmov d, r, r' or
 // with two 'vmov d[],r'.  LLVM does the latter. We may want to be less
 // strict about the matching pattern if it starts causing problem.
-// CHECK-LABEL: test_vcreate_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vcreate_u64(uint64_t a) {
   uint64x1_t tmp = vcreate_u64(a);
   return vadd_u64(tmp, tmp);
 
 }
 
-// CHECK-LABEL: test_vcreate_p8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vcnt.8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcreate_p8(uint64_t a) {
   return vcnt_p8(vcreate_p8(a));
 }
 
-// CHECK-LABEL: test_vcreate_p16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
+// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP4:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP4]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 poly16x4_t test_vcreate_p16(uint64_t a) {
   poly16x4_t tmp = vcreate_p16(a);
   return vbsl_p16(tmp, tmp, tmp);
 }
 
-// CHECK-LABEL: test_vcreate_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vcreate_s64(uint64_t a) {
   int64x1_t tmp = vcreate_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
 
-// CHECK-LABEL: test_vcvt_f16_f32
-// CHECK: vcvt.f16.f32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
   return vcvt_f16_f32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
   return vcvt_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvt_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
   return vcvt_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
   return vcvtq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
   return vcvtq_f32_u32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_f32_f16
-// CHECK: vcvt.f32.f16
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
   return vcvt_f32_f16(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N]]1
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
   return vcvt_n_f32_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvt_n_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N]]1
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
   return vcvt_n_f32_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N]]1
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
   return vcvtq_n_f32_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N]]1
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
   return vcvtq_n_f32_u32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N]]1
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
   return vcvt_n_s32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N]]1
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
   return vcvtq_n_s32_f32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N]]1
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
   return vcvt_n_u32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N]]1
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
   return vcvtq_n_u32_f32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
   return vcvt_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
   return vcvtq_s32_f32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
   return vcvt_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
   return vcvtq_u32_f32(a);
 }
 
 
-// CHECK-LABEL: test_vdup_lane_u8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
   return vdup_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_u16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
   return vdup_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_u32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
   return vdup_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 int8x8_t test_vdup_lane_s8(int8x8_t a) {
   return vdup_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_s16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 int16x4_t test_vdup_lane_s16(int16x4_t a) {
   return vdup_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_s32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 int32x2_t test_vdup_lane_s32(int32x2_t a) {
   return vdup_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_p8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
   return vdup_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_p16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
   return vdup_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_f32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE]]
 float32x2_t test_vdup_lane_f32(float32x2_t a) {
   return vdup_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
   return vdupq_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
   return vdupq_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
   return vdupq_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 int8x16_t test_vdupq_lane_s8(int8x8_t a) {
   return vdupq_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 int16x8_t test_vdupq_lane_s16(int16x4_t a) {
   return vdupq_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 int32x4_t test_vdupq_lane_s32(int32x2_t a) {
   return vdupq_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
   return vdupq_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
   return vdupq_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_f32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x float> [[SHUFFLE]]
 float32x4_t test_vdupq_lane_f32(float32x2_t a) {
   return vdupq_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s64
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 int64x1_t test_vdup_lane_s64(int64x1_t a) {
   return vdup_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdup_lane_u64
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
   return vdup_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 int64x2_t test_vdupq_lane_s64(int64x1_t a) {
   return vdupq_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
   return vdupq_lane_u64(a, 0);
 }
 
 
-// CHECK-LABEL: test_vdup_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vdup_n_u8(uint8_t a) {
   return vdup_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vdup_n_u16(uint16_t a) {
   return vdup_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vdup_n_u32(uint32_t a) {
   return vdup_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vdup_n_s8(int8_t a) {
   return vdup_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vdup_n_s16(int16_t a) {
   return vdup_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vdup_n_s32(int32_t a) {
   return vdup_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vdup_n_p8(poly8_t a) {
   return vdup_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vdup_n_p16(poly16_t a) {
   return vdup_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT]]1, half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT]]2, half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT]]3
 float16x4_t test_vdup_n_f16(float16_t *a) {
   return vdup_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdup_n_f32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vdup_n_f32(float32_t a) {
   return vdup_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vdupq_n_u8(uint8_t a) {
   return vdupq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vdupq_n_u16(uint16_t a) {
   return vdupq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vdupq_n_u32(uint32_t a) {
   return vdupq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vdupq_n_s8(int8_t a) {
   return vdupq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vdupq_n_s16(int16_t a) {
   return vdupq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vdupq_n_s32(int32_t a) {
   return vdupq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vdupq_n_p8(poly8_t a) {
   return vdupq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vdupq_n_p16(poly16_t a) {
   return vdupq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT]]1, half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT]]2, half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT]]3, half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT]]4, half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT]]5, half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT]]6, half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT]]7
 float16x8_t test_vdupq_n_f16(float16_t *a) {
   return vdupq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vdupq_n_f32(float32_t a) {
   return vdupq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s64
-// CHECK: vmov
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vdup_n_s64(int64_t a) {
   int64x1_t tmp = vdup_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdup_n_u64
-// CHECK: vmov
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vdup_n_u64(uint64_t a) {
   int64x1_t tmp = vdup_n_u64(a);
   return vadd_s64(tmp, tmp);
 
 }
 
-// CHECK-LABEL: test_vdupq_n_s64
-// CHECK: vmov
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vdupq_n_s64(int64_t a) {
   int64x2_t tmp = vdupq_n_s64(a);
   return vaddq_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdupq_n_u64
-// CHECK: vmov
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vdupq_n_u64(uint64_t a) {
   int64x2_t tmp = vdupq_n_u64(a);
   return vaddq_u64(tmp, tmp);
 }
 
 
-// CHECK-LABEL: test_veor_s8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
   return veor_s8(a, b);
 }
 
-// CHECK-LABEL: test_veor_s16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
   return veor_s16(a, b);
 }
 
-// CHECK-LABEL: test_veor_s32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
   return veor_s32(a, b);
 }
 
-// CHECK-LABEL: test_veor_s64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
   return veor_s64(a, b);
 }
 
-// CHECK-LABEL: test_veor_u8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
   return veor_u8(a, b);
 }
 
-// CHECK-LABEL: test_veor_u16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
   return veor_u16(a, b);
 }
 
-// CHECK-LABEL: test_veor_u32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
   return veor_u32(a, b);
 }
 
-// CHECK-LABEL: test_veor_u64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
   return veor_u64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
   return veorq_s8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
   return veorq_s16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
   return veorq_s32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
   return veorq_s64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
   return veorq_u8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
   return veorq_u16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
   return veorq_u32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
   return veorq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vext_s8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
   return vext_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_u8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
   return vext_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_p8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
   return vext_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_s16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
   return vext_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_u16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
   return vext_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_p16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
   return vext_p16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_s32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
   return vext_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_u32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
   return vext_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_s64
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
   return vext_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_u64
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
   return vext_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_f32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
   return vext_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_s8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
   return vextq_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_u8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
   return vextq_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_p8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
   return vextq_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_s16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
   return vextq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_u16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
   return vextq_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_p16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
   return vextq_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_s32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
   return vextq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_u32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
   return vextq_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
   return vextq_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
   return vextq_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_f32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
   return vextq_f32(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vfma_f32
-// CHECK: vfma.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfma_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmaq_f32
-// CHECK: vfma.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmaq_f32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vget_high_s8
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_high_s8(int8x16_t a) {
   return vget_high_s8(a);
 }
 
-// CHECK-LABEL: test_vget_high_s16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_high_s16(int16x8_t a) {
   return vget_high_s16(a);
 }
 
-// CHECK-LABEL: test_vget_high_s32
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_high_s32(int32x4_t a) {
   return vget_high_s32(a);
 }
 
-// CHECK-LABEL: test_vget_high_s64
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_high_s64(int64x2_t a) {
   return vget_high_s64(a);
 }
 
-// CHECK-LABEL: test_vget_high_f16
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_high_f16(float16x8_t a) {
   return vget_high_f16(a);
 }
 
-// CHECK-LABEL: test_vget_high_f32
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_high_f32(float32x4_t a) {
   return vget_high_f32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u8
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
   return vget_high_u8(a);
 }
 
-// CHECK-LABEL: test_vget_high_u16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
   return vget_high_u16(a);
 }
 
-// CHECK-LABEL: test_vget_high_u32
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
   return vget_high_u32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u64
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
   return vget_high_u64(a);
 }
 
-// CHECK-LABEL: test_vget_high_p8
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
   return vget_high_p8(a);
 }
 
-// CHECK-LABEL: test_vget_high_p16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
   return vget_high_p16(a);
 }
 
 
-// CHECK-LABEL: test_vget_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vget_lane_u8(uint8x8_t a) {
   return vget_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vget_lane_u16(uint16x4_t a) {
   return vget_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_u32
-// CHECK: mov 
+// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vget_lane_u32(uint32x2_t a) {
   return vget_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vget_lane_s8(int8x8_t a) {
   return vget_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vget_lane_s16(int16x4_t a) {
   return vget_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s32
-// CHECK: mov 
+// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vget_lane_s32(int32x2_t a) {
   return vget_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vget_lane_p8(poly8x8_t a) {
   return vget_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vget_lane_p16(poly16x4_t a) {
   return vget_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vget_lane_f32(float32x2_t a) {
   return vget_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
+// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
+// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vget_lane_f16(float16x4_t a) {
   return vget_lane_f16(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
   return vgetq_lane_u8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
   return vgetq_lane_u16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u32
-// CHECK: vmov 
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
   return vgetq_lane_u32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vgetq_lane_s8(int8x16_t a) {
   return vgetq_lane_s8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vgetq_lane_s16(int16x8_t a) {
   return vgetq_lane_s16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s32
-// CHECK: vmov 
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vgetq_lane_s32(int32x4_t a) {
   return vgetq_lane_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
   return vgetq_lane_p8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
   return vgetq_lane_p16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vgetq_lane_f32(float32x4_t a) {
   return vgetq_lane_f32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+// CHECK:   store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vgetq_lane_f16(float16x8_t a) {
   return vgetq_lane_f16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s64
 // The optimizer is able to remove all moves now.
+// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vget_lane_s64(int64x1_t a) {
   return vget_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vget_lane_u64
 // The optimizer is able to remove all moves now.
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vget_lane_u64(uint64x1_t a) {
   return vget_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s64
-// CHECK: vmov 
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vgetq_lane_s64(int64x2_t a) {
   return vgetq_lane_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u64
-// CHECK: vmov 
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
   return vgetq_lane_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vget_low_s8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_low_s8(int8x16_t a) {
   return vget_low_s8(a);
 }
 
-// CHECK-LABEL: test_vget_low_s16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_low_s16(int16x8_t a) {
   return vget_low_s16(a);
 }
 
-// CHECK-LABEL: test_vget_low_s32
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_low_s32(int32x4_t a) {
   return vget_low_s32(a);
 }
 
-// CHECK-LABEL: test_vget_low_s64
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_low_s64(int64x2_t a) {
   return vget_low_s64(a);
 }
 
-// CHECK-LABEL: test_vget_low_f16
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_low_f16(float16x8_t a) {
   return vget_low_f16(a);
 }
 
-// CHECK-LABEL: test_vget_low_f32
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_low_f32(float32x4_t a) {
   return vget_low_f32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
   return vget_low_u8(a);
 }
 
-// CHECK-LABEL: test_vget_low_u16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
   return vget_low_u16(a);
 }
 
-// CHECK-LABEL: test_vget_low_u32
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
   return vget_low_u32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u64
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
   return vget_low_u64(a);
 }
 
-// CHECK-LABEL: test_vget_low_p8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
   return vget_low_p8(a);
 }
 
-// CHECK-LABEL: test_vget_low_p16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
   return vget_low_p16(a);
 }
 
 
-// CHECK-LABEL: test_vhadd_s8
-// CHECK: vhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
   return vhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s16
-// CHECK: vhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
   return vhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s32
-// CHECK: vhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
   return vhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u8
-// CHECK: vhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u16
-// CHECK: vhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u32
-// CHECK: vhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s8
-// CHECK: vhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
   return vhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s16
-// CHECK: vhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
   return vhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s32
-// CHECK: vhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
   return vhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u8
-// CHECK: vhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u16
-// CHECK: vhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u32
-// CHECK: vhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vhaddq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vhsub_s8
-// CHECK: vhsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
   return vhsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s16
-// CHECK: vhsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
   return vhsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s32
-// CHECK: vhsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
   return vhsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u8
-// CHECK: vhsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
   return vhsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u16
-// CHECK: vhsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
   return vhsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u32
-// CHECK: vhsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
   return vhsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s8
-// CHECK: vhsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
   return vhsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s16
-// CHECK: vhsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
   return vhsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s32
-// CHECK: vhsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
   return vhsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u8
-// CHECK: vhsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vhsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u16
-// CHECK: vhsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vhsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u32
-// CHECK: vhsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vhsubq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vld1q_u8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 uint8x16_t test_vld1q_u8(uint8_t const * a) {
   return vld1q_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_u16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vld1q_u16(uint16_t const * a) {
   return vld1q_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_u32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vld1q_u32(uint32_t const * a) {
   return vld1q_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_u64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vld1q_u64(uint64_t const * a) {
   return vld1q_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_s8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 int8x16_t test_vld1q_s8(int8_t const * a) {
   return vld1q_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_s16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vld1q_s16(int16_t const * a) {
   return vld1q_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_s32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vld1q_s32(int32_t const * a) {
   return vld1q_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_s64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vld1q_s64(int64_t const * a) {
   return vld1q_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_f16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP3]]
 float16x8_t test_vld1q_f16(float16_t const * a) {
   return vld1q_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_f32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vld1q_f32(float32_t const * a) {
   return vld1q_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_p8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 poly8x16_t test_vld1q_p8(poly8_t const * a) {
   return vld1q_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_p16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 poly16x8_t test_vld1q_p16(poly16_t const * a) {
   return vld1q_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_u8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 uint8x8_t test_vld1_u8(uint8_t const * a) {
   return vld1_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_u16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vld1_u16(uint16_t const * a) {
   return vld1_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_u32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vld1_u32(uint32_t const * a) {
   return vld1_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_u64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vld1_u64(uint64_t const * a) {
   return vld1_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_s8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 int8x8_t test_vld1_s8(int8_t const * a) {
   return vld1_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_s16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vld1_s16(int16_t const * a) {
   return vld1_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_s32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vld1_s32(int32_t const * a) {
   return vld1_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_s64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vld1_s64(int64_t const * a) {
   return vld1_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_f16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP3]]
 float16x4_t test_vld1_f16(float16_t const * a) {
   return vld1_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_f32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vld1_f32(float32_t const * a) {
   return vld1_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_p8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 poly8x8_t test_vld1_p8(poly8_t const * a) {
   return vld1_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_p16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 poly16x4_t test_vld1_p16(poly16_t const * a) {
   return vld1_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld1q_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
   return vld1q_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
   return vld1q_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
   return vld1q_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
   return vld1q_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t const * a) {
   return vld1q_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t const * a) {
   return vld1q_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t const * a) {
   return vld1q_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t const * a) {
   return vld1q_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t const * a) {
   return vld1q_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t const * a) {
   return vld1q_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
   return vld1q_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
   return vld1q_dup_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
   return vld1_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
   return vld1_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
   return vld1_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
   return vld1_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t const * a) {
   return vld1_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t const * a) {
   return vld1_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t const * a) {
   return vld1_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t const * a) {
   return vld1_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t const * a) {
   return vld1_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t const * a) {
   return vld1_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
   return vld1_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
   return vld1_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld1q_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
   return vld1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
   return vld1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
   return vld1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
   return vld1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
   return vld1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
   return vld1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
   return vld1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
   return vld1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
   return vld1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
   return vld1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
   return vld1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
   return vld1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
   return vld1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
   return vld1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
   return vld1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
   return vld1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
   return vld1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
   return vld1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
   return vld1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
   return vld1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
   return vld1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
   return vld1_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld2q_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP5]]
 uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
   return vld2q_u8(a);
 }
 
-// CHECK-LABEL: test_vld2q_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
   return vld2q_u16(a);
 }
 
-// CHECK-LABEL: test_vld2q_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
   return vld2q_u32(a);
 }
 
-// CHECK-LABEL: test_vld2q_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP5]]
 int8x16x2_t test_vld2q_s8(int8_t const * a) {
   return vld2q_s8(a);
 }
 
-// CHECK-LABEL: test_vld2q_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_s16(int16_t const * a) {
   return vld2q_s16(a);
 }
 
-// CHECK-LABEL: test_vld2q_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_s32(int32_t const * a) {
   return vld2q_s32(a);
 }
 
-// CHECK-LABEL: test_vld2q_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_f16(float16_t const * a) {
   return vld2q_f16(a);
 }
 
-// CHECK-LABEL: test_vld2q_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_f32(float32_t const * a) {
   return vld2q_f32(a);
 }
 
-// CHECK-LABEL: test_vld2q_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP5]]
 poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
   return vld2q_p8(a);
 }
 
-// CHECK-LABEL: test_vld2q_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
   return vld2q_p16(a);
 }
 
-// CHECK-LABEL: test_vld2_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP5]]
 uint8x8x2_t test_vld2_u8(uint8_t const * a) {
   return vld2_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_u16(uint16_t const * a) {
   return vld2_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_u32(uint32_t const * a) {
   return vld2_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_u64(uint64_t const * a) {
   return vld2_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP5]]
 int8x8x2_t test_vld2_s8(int8_t const * a) {
   return vld2_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_s16(int16_t const * a) {
   return vld2_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_s32(int32_t const * a) {
   return vld2_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_s64(int64_t const * a) {
   return vld2_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_f16(float16_t const * a) {
   return vld2_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_f32(float32_t const * a) {
   return vld2_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP5]]
 poly8x8x2_t test_vld2_p8(poly8_t const * a) {
   return vld2_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_p16(poly16_t const * a) {
   return vld2_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld2_dup_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
   return vld2_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
   return vld2_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
   return vld2_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
   return vld2_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
   return vld2_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
   return vld2_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
   return vld2_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
   return vld2_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
   return vld2_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
   return vld2_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
   return vld2_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
   return vld2_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld2q_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP1]]3
 uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
   return vld2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP1]]3
 uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
   return vld2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP1]]3
 int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
   return vld2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP1]]3
 int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
   return vld2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP1]]3
 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP1]]3
 float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
   return vld2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP1]]3
 poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
   return vld2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
   return vld2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP1]]3
 uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
   return vld2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP1]]3
 uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
   return vld2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
   return vld2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP1]]3
 int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
   return vld2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP1]]3
 int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
   return vld2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP1]]3
 float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
   return vld2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP1]]3
 float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
   return vld2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
   return vld2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP1]]0
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]1, i8* [[TMP1]]2, i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP1]]3
 poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
   return vld2_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld3q_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP5]]
 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
   return vld3q_u8(a);
 }
 
-// CHECK-LABEL: test_vld3q_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
   return vld3q_u16(a);
 }
 
-// CHECK-LABEL: test_vld3q_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
   return vld3q_u32(a);
 }
 
-// CHECK-LABEL: test_vld3q_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP5]]
 int8x16x3_t test_vld3q_s8(int8_t const * a) {
   return vld3q_s8(a);
 }
 
-// CHECK-LABEL: test_vld3q_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_s16(int16_t const * a) {
   return vld3q_s16(a);
 }
 
-// CHECK-LABEL: test_vld3q_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_s32(int32_t const * a) {
   return vld3q_s32(a);
 }
 
-// CHECK-LABEL: test_vld3q_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_f16(float16_t const * a) {
   return vld3q_f16(a);
 }
 
-// CHECK-LABEL: test_vld3q_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_f32(float32_t const * a) {
   return vld3q_f32(a);
 }
 
-// CHECK-LABEL: test_vld3q_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP5]]
 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
   return vld3q_p8(a);
 }
 
-// CHECK-LABEL: test_vld3q_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
   return vld3q_p16(a);
 }
 
-// CHECK-LABEL: test_vld3_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP5]]
 uint8x8x3_t test_vld3_u8(uint8_t const * a) {
   return vld3_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_u16(uint16_t const * a) {
   return vld3_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_u32(uint32_t const * a) {
   return vld3_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_u64(uint64_t const * a) {
   return vld3_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP5]]
 int8x8x3_t test_vld3_s8(int8_t const * a) {
   return vld3_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_s16(int16_t const * a) {
   return vld3_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_s32(int32_t const * a) {
   return vld3_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_s64(int64_t const * a) {
   return vld3_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_f16(float16_t const * a) {
   return vld3_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_f32(float32_t const * a) {
   return vld3_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP5]]
 poly8x8x3_t test_vld3_p8(poly8_t const * a) {
   return vld3_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_p16(poly16_t const * a) {
   return vld3_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld3_dup_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
   return vld3_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
   return vld3_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
   return vld3_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
   return vld3_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
   return vld3_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
   return vld3_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
   return vld3_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
   return vld3_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
   return vld3_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
   return vld3_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
   return vld3_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
   return vld3_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld3q_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP1]]6
 uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
   return vld3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP1]]6
 uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
   return vld3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP1]]6
 int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
   return vld3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP1]]0, <4 x i32> [[TMP1]]1, <4 x i32> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP1]]6
 int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
   return vld3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP1]]6
 float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP1]]0, <4 x float> [[TMP1]]1, <4 x float> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP1]]6
 float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP1]]0, <8 x i16> [[TMP1]]1, <8 x i16> [[TMP1]]2, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP1]]6
 poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
 uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP1]]6
 uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP1]]6
 uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP9]]
 int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
   return vld3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP1]]6
 int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
   return vld3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP1]]0, <2 x i32> [[TMP1]]1, <2 x i32> [[TMP1]]2, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP1]]6
 int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
   return vld3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP1]]6
 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
   return vld3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP1]]0, <2 x float> [[TMP1]]1, <2 x float> [[TMP1]]2, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP1]]6
 float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
   return vld3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
 poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
   return vld3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP1]]0, <4 x i16> [[TMP1]]1, <4 x i16> [[TMP1]]2, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]3
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]4, i8* [[TMP1]]5, i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP1]]6
 poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
   return vld3_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld4q_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP5]]
 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
   return vld4q_u8(a);
 }
 
-// CHECK-LABEL: test_vld4q_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
   return vld4q_u16(a);
 }
 
-// CHECK-LABEL: test_vld4q_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
   return vld4q_u32(a);
 }
 
-// CHECK-LABEL: test_vld4q_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP5]]
 int8x16x4_t test_vld4q_s8(int8_t const * a) {
   return vld4q_s8(a);
 }
 
-// CHECK-LABEL: test_vld4q_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_s16(int16_t const * a) {
   return vld4q_s16(a);
 }
 
-// CHECK-LABEL: test_vld4q_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_s32(int32_t const * a) {
   return vld4q_s32(a);
 }
 
-// CHECK-LABEL: test_vld4q_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_f16(float16_t const * a) {
   return vld4q_f16(a);
 }
 
-// CHECK-LABEL: test_vld4q_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_f32(float32_t const * a) {
   return vld4q_f32(a);
 }
 
-// CHECK-LABEL: test_vld4q_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP5]]
 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
   return vld4q_p8(a);
 }
 
-// CHECK-LABEL: test_vld4q_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
   return vld4q_p16(a);
 }
 
-// CHECK-LABEL: test_vld4_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP5]]
 uint8x8x4_t test_vld4_u8(uint8_t const * a) {
   return vld4_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_u16(uint16_t const * a) {
   return vld4_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_u32(uint32_t const * a) {
   return vld4_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_u64(uint64_t const * a) {
   return vld4_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP5]]
 int8x8x4_t test_vld4_s8(int8_t const * a) {
   return vld4_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_s16(int16_t const * a) {
   return vld4_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_s32(int32_t const * a) {
   return vld4_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_s64(int64_t const * a) {
   return vld4_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_f16(float16_t const * a) {
   return vld4_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_f32(float32_t const * a) {
   return vld4_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP5]]
 poly8x8x4_t test_vld4_p8(poly8_t const * a) {
   return vld4_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_p16(poly16_t const * a) {
   return vld4_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld4_dup_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
   return vld4_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
   return vld4_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
   return vld4_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
   return vld4_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
   return vld4_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
   return vld4_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
   return vld4_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
   return vld4_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
   return vld4_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
   return vld4_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
   return vld4_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
   return vld4_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld4q_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, <8 x i16> [[TMP1]]5, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP1]]9
 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
   return vld4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, <4 x i32> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP1]]9
 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
   return vld4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, <8 x i16> [[TMP1]]5, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP1]]9
 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
   return vld4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP1]]2, <4 x i32> [[TMP1]]3, <4 x i32> [[TMP1]]4, <4 x i32> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP1]]9
 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
   return vld4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, <8 x i16> [[TMP1]]5, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP1]]9
 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <4 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP1]]2, <4 x float> [[TMP1]]3, <4 x float> [[TMP1]]4, <4 x float> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP1]]9
 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
   return vld4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]2, align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]4, align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]]6, align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP1]]0 to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP1]]1 to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP1]]2, <8 x i16> [[TMP1]]3, <8 x i16> [[TMP1]]4, <8 x i16> [[TMP1]]5, i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP1]]9
 poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
   return vld4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP1]]0
 uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
   return vld4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP1]]9
 uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
   return vld4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP1]]9
 uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
   return vld4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP1]]0
 int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
   return vld4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP1]]9
 int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
   return vld4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP1]]2, <2 x i32> [[TMP1]]3, <2 x i32> [[TMP1]]4, <2 x i32> [[TMP1]]5, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP1]]9
 int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
   return vld4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP1]]9
 float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <2 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP1]]2, <2 x float> [[TMP1]]3, <2 x float> [[TMP1]]4, <2 x float> [[TMP1]]5, i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP1]]9
 float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP1]]0
 poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
   return vld4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]1, i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]2, align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]3, i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]4, align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]]5, i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]]6, align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP1]]0 to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP1]]1 to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP1]]2, <4 x i16> [[TMP1]]3, <4 x i16> [[TMP1]]4, <4 x i16> [[TMP1]]5, i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP1]]6
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP1]]7, i8* [[TMP1]]8, i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP1]]9
 poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
   return vld4_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vmax_s8
-// CHECK: vmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
   return vmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s16
-// CHECK: vmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
   return vmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s32
-// CHECK: vmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
   return vmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u8
-// CHECK: vmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
   return vmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u16
-// CHECK: vmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
   return vmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u32
-// CHECK: vmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
   return vmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_f32
-// CHECK: vmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x float> [[VMAX2_I]]
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
   return vmax_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s8
-// CHECK: vmax.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
   return vmaxq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s16
-// CHECK: vmax.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
   return vmaxq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s32
-// CHECK: vmax.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
   return vmaxq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u8
-// CHECK: vmax.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
   return vmaxq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u16
-// CHECK: vmax.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
   return vmaxq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u32
-// CHECK: vmax.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
   return vmaxq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_f32
-// CHECK: vmax.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x float> [[VMAX2_I]]
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
   return vmaxq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmin_s8
-// CHECK: vmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
   return vmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s16
-// CHECK: vmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
   return vmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s32
-// CHECK: vmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
   return vmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u8
-// CHECK: vmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
   return vmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u16
-// CHECK: vmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
   return vmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u32
-// CHECK: vmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
   return vmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_f32
-// CHECK: vmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x float> [[VMIN2_I]]
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
   return vmin_f32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s8
-// CHECK: vmin.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
   return vminq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s16
-// CHECK: vmin.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
   return vminq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s32
-// CHECK: vmin.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
   return vminq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u8
-// CHECK: vmin.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
   return vminq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u16
-// CHECK: vmin.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
   return vminq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u32
-// CHECK: vmin.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
   return vminq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_f32
-// CHECK: vmin.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x float> [[VMIN2_I]]
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
   return vminq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmla_s8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmla_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmla_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlaq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlaq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlal_s8
-// CHECK: vmlal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u8
-// CHECK: vmlal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlal_lane_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_lane_u32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmlal_n_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlal_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlal_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlal_n_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmla_lane_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlaq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlaq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlaq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlaq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlaq_lane_f32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmla_n_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmla_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmla_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmla_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmla_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmla_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlaq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlaq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlaq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlaq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, 
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlaq_n_f32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmls_s8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmls_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmls_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlsq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlsq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlsq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlsq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlsq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlsq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlsq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlsl_s8
-// CHECK: vmlsl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u8
-// CHECK: vmlsl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlsl_lane_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_lane_u32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmlsl_n_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlsl_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlsl_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlsl_n_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmls_lane_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlsq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlsq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlsq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlsq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlsq_lane_f32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmls_n_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmls_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmls_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmls_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmls_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmls_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlsq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlsq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlsq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlsq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlsq_n_f32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmovl_s8
-// CHECK: vmovl.s8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
   return vmovl_s8(a);
 }
 
-// CHECK-LABEL: test_vmovl_s16
-// CHECK: vmovl.s16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
   return vmovl_s16(a);
 }
 
-// CHECK-LABEL: test_vmovl_s32
-// CHECK: vmovl.s32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
   return vmovl_s32(a);
 }
 
-// CHECK-LABEL: test_vmovl_u8
-// CHECK: vmovl.u8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
   return vmovl_u8(a);
 }
 
-// CHECK-LABEL: test_vmovl_u16
-// CHECK: vmovl.u16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
   return vmovl_u16(a);
 }
 
-// CHECK-LABEL: test_vmovl_u32
-// CHECK: vmovl.u32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
   return vmovl_u32(a);
 }
 
 
-// CHECK-LABEL: test_vmovn_s16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
   return vmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vmovn_s32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
   return vmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vmovn_s64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
   return vmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vmovn_u16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
   return vmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vmovn_u32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
   return vmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vmovn_u64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
   return vmovn_u64(a);
 }
 
 
-// CHECK-LABEL: test_vmov_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vmov_n_u8(uint8_t a) {
   return vmov_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vmov_n_u16(uint16_t a) {
   return vmov_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vmov_n_u32(uint32_t a) {
   return vmov_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vmov_n_s8(int8_t a) {
   return vmov_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vmov_n_s16(int16_t a) {
   return vmov_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vmov_n_s32(int32_t a) {
   return vmov_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vmov_n_p8(poly8_t a) {
   return vmov_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vmov_n_p16(poly16_t a) {
   return vmov_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT]]1, half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT]]2, half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT]]3
 float16x4_t test_vmov_n_f16(float16_t *a) {
   return vmov_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmov_n_f32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vmov_n_f32(float32_t a) {
   return vmov_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vmovq_n_u8(uint8_t a) {
   return vmovq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vmovq_n_u16(uint16_t a) {
   return vmovq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vmovq_n_u32(uint32_t a) {
   return vmovq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vmovq_n_s8(int8_t a) {
   return vmovq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vmovq_n_s16(int16_t a) {
   return vmovq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vmovq_n_s32(int32_t a) {
   return vmovq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vmovq_n_p8(poly8_t a) {
   return vmovq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vmovq_n_p16(poly16_t a) {
   return vmovq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT]]1, half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT]]2, half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT]]3, half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT]]4, half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT]]5, half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT]]6, half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT]]7
 float16x8_t test_vmovq_n_f16(float16_t *a) {
   return vmovq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vmovq_n_f32(float32_t a) {
   return vmovq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vmov_n_s64(int64_t a) {
   int64x1_t tmp = vmov_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmov_n_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vmov_n_u64(uint64_t a) {
   uint64x1_t tmp = vmov_n_u64(a);
   return vadd_u64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmovq_n_s64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 int64x2_t test_vmovq_n_s64(int64_t a) {
   return vmovq_n_s64(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 uint64x2_t test_vmovq_n_u64(uint64_t a) {
   return vmovq_n_u64(a);
 }
 
 
-// CHECK-LABEL: test_vmul_s8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
   return vmul_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
   return vmul_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
   return vmul_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
   return vmul_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
   return vmul_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
   return vmulq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
   return vmulq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
   return vmulq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
   return vmulq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
   return vmulq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
   return vmulq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
   return vmulq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmull_s8
-// CHECK: vmull.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
   return vmull_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
   return vmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
   return vmull_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u8
-// CHECK: vmull.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
   return vmull_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_p8
-// CHECK: vmull.p8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
   return vmull_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vmull_lane_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vmull_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmull_lane_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_lane_u32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vmull_n_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
   return vmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
   return vmull_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
   return vmull_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
   return vmull_n_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmul_p8
-// CHECK: vmul.p8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMUL_V_I]]
 poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
   return vmul_p8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_p8
-// CHECK: vmul.p8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
 poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
   return vmulq_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vmul_lane_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
   return vmul_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
   return vmul_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
   return vmul_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
   return vmulq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
   return vmulq_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
   return vmulq_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
   return vmulq_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
   return vmulq_lane_u32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vmul_n_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
   return vmul_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
   return vmul_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
   return vmul_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
   return vmul_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
   return vmul_n_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
   return vmulq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
   return vmulq_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
   return vmulq_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
   return vmulq_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
   return vmulq_n_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmvn_s8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
   return vmvn_s8(a);
 }
 
-// CHECK-LABEL: test_vmvn_s16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
   return vmvn_s16(a);
 }
 
-// CHECK-LABEL: test_vmvn_s32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
   return vmvn_s32(a);
 }
 
-// CHECK-LABEL: test_vmvn_u8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
   return vmvn_u8(a);
 }
 
-// CHECK-LABEL: test_vmvn_u16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
   return vmvn_u16(a);
 }
 
-// CHECK-LABEL: test_vmvn_u32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
   return vmvn_u32(a);
 }
 
-// CHECK-LABEL: test_vmvn_p8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
   return vmvn_p8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
   return vmvnq_s8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
   return vmvnq_s16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
   return vmvnq_s32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
   return vmvnq_u8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
   return vmvnq_u16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
   return vmvnq_u32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_p8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
   return vmvnq_p8(a);
 }
 
 
-// CHECK-LABEL: test_vneg_s8
-// CHECK: vneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
   return vneg_s8(a);
 }
 
-// CHECK-LABEL: test_vneg_s16
-// CHECK: vneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
   return vneg_s16(a);
 }
 
-// CHECK-LABEL: test_vneg_s32
-// CHECK: vneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
   return vneg_s32(a);
 }
 
-// CHECK-LABEL: test_vneg_f32
-// CHECK: vneg.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
   return vneg_f32(a);
 }
 
-// CHECK-LABEL: test_vnegq_s8
-// CHECK: vneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
   return vnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vnegq_s16
-// CHECK: vneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
   return vnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vnegq_s32
-// CHECK: vneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
   return vnegq_s32(a);
 }
 
-// CHECK-LABEL: test_vnegq_f32
-// CHECK: vneg.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
   return vnegq_f32(a);
 }
 
 
-// CHECK-LABEL: test_vorn_s8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
   return vorn_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
   return vorn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
   return vorn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
   return vorn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
   return vorn_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
   return vorn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
   return vorn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
   return vorn_u64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
   return vornq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
   return vornq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
   return vornq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
   return vornq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
   return vornq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
   return vornq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
   return vornq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
   return vornq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vorr_s8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
   return vorr_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
   return vorr_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
   return vorr_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
   return vorr_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
   return vorr_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
   return vorr_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
   return vorr_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
   return vorr_u64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
   return vorrq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
   return vorrq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
   return vorrq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
   return vorrq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
   return vorrq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
   return vorrq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
   return vorrq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
   return vorrq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vpadal_s8
-// CHECK: vpadal.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #4
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
   return vpadal_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s16
-// CHECK: vpadal.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
   return vpadal_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s32
-// CHECK: vpadal.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
   return vpadal_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u8
-// CHECK: vpadal.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #4
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
   return vpadal_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u16
-// CHECK: vpadal.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
   return vpadal_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u32
-// CHECK: vpadal.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
   return vpadal_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s8
-// CHECK: vpadal.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #4
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
   return vpadalq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s16
-// CHECK: vpadal.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
   return vpadalq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s32
-// CHECK: vpadal.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
   return vpadalq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u8
-// CHECK: vpadal.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #4
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
   return vpadalq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u16
-// CHECK: vpadal.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
   return vpadalq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u32
-// CHECK: vpadal.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
   return vpadalq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpadd_s8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
   return vpadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
   return vpadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
   return vpadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
   return vpadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
   return vpadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
   return vpadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_f32
-// CHECK: vpadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
   return vpadd_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpaddl_s8
-// CHECK: vpaddl.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
   return vpaddl_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s16
-// CHECK: vpaddl.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
   return vpaddl_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s32
-// CHECK: vpaddl.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
   return vpaddl_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u8
-// CHECK: vpaddl.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
   return vpaddl_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u16
-// CHECK: vpaddl.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
   return vpaddl_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u32
-// CHECK: vpaddl.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
   return vpaddl_u32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s8
-// CHECK: vpaddl.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
   return vpaddlq_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s16
-// CHECK: vpaddl.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
   return vpaddlq_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s32
-// CHECK: vpaddl.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
   return vpaddlq_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u8
-// CHECK: vpaddl.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
   return vpaddlq_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u16
-// CHECK: vpaddl.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
   return vpaddlq_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u32
-// CHECK: vpaddl.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
   return vpaddlq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vpmax_s8
-// CHECK: vpmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
   return vpmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s16
-// CHECK: vpmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
   return vpmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s32
-// CHECK: vpmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
   return vpmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u8
-// CHECK: vpmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
   return vpmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u16
-// CHECK: vpmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
   return vpmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u32
-// CHECK: vpmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
   return vpmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_f32
-// CHECK: vpmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMAX2_I]]
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
   return vpmax_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpmin_s8
-// CHECK: vpmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
   return vpmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s16
-// CHECK: vpmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
   return vpmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s32
-// CHECK: vpmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
   return vpmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u8
-// CHECK: vpmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
   return vpmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u16
-// CHECK: vpmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
   return vpmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u32
-// CHECK: vpmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
   return vpmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_f32
-// CHECK: vpmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMIN2_I]]
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
   return vpmin_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqabs_s8
-// CHECK: vqabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
   return vqabs_s8(a);
 }
 
-// CHECK-LABEL: test_vqabs_s16
-// CHECK: vqabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
   return vqabs_s16(a);
 }
 
-// CHECK-LABEL: test_vqabs_s32
-// CHECK: vqabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
   return vqabs_s32(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s8
-// CHECK: vqabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
   return vqabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s16
-// CHECK: vqabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
   return vqabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s32
-// CHECK: vqabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
   return vqabsq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vqadd_s8
-// CHECK: vqadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s16
-// CHECK: vqadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s32
-// CHECK: vqadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s64
-// CHECK: vqadd.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u8
-// CHECK: vqadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u16
-// CHECK: vqadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u32
-// CHECK: vqadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u64
-// CHECK: vqadd.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s8
-// CHECK: vqadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s16
-// CHECK: vqadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s32
-// CHECK: vqadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s64
-// CHECK: vqadd.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u8
-// CHECK: vqadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u16
-// CHECK: vqadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u32
-// CHECK: vqadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u64
-// CHECK: vqadd.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_lane_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlal_lane_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_n_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_n_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlal_n_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_lane_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlsl_lane_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_n_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_n_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_lane_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulh_lane_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_n_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_n_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmull_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmull_lane_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmull_lane_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmull_n_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_n_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqmovn_s16
-// CHECK: vqmovn.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s32
-// CHECK: vqmovn.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s64
-// CHECK: vqmovn.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u16
-// CHECK: vqmovn.u16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u32
-// CHECK: vqmovn.u32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u64
-// CHECK: vqmovn.u64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }
 
 
-// CHECK-LABEL: test_vqmovun_s16
-// CHECK: vqmovun.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s32
-// CHECK: vqmovun.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s64
-// CHECK: vqmovun.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }
 
 
-// CHECK-LABEL: test_vqneg_s8
-// CHECK: vqneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }
 
-// CHECK-LABEL: test_vqneg_s16
-// CHECK: vqneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }
 
-// CHECK-LABEL: test_vqneg_s32
-// CHECK: vqneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s8
-// CHECK: vqneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s16
-// CHECK: vqneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s32
-// CHECK: vqneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqrdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqrdmulhq_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_lane_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulh_lane_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqrdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqrdmulhq_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_n_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqrdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_n_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqrdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqrdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqrdmulhq_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrshl_s8
-// CHECK: vqrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
   return vqrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s16
-// CHECK: vqrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
   return vqrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s32
-// CHECK: vqrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
   return vqrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s64
-// CHECK: vqrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
   return vqrshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u8
-// CHECK: vqrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
   return vqrshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u16
-// CHECK: vqrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
   return vqrshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u32
-// CHECK: vqrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
   return vqrshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u64
-// CHECK: vqrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
   return vqrshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s8
-// CHECK: vqrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
   return vqrshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s16
-// CHECK: vqrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
   return vqrshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s32
-// CHECK: vqrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
   return vqrshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s64
-// CHECK: vqrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
   return vqrshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u8
-// CHECK: vqrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u16
-// CHECK: vqrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqrshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u32
-// CHECK: vqrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqrshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u64
-// CHECK: vqrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqrshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrshrn_n_s16
-// CHECK: vqrshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N]]1
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   return vqrshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s32
-// CHECK: vqrshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N]]1
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   return vqrshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s64
-// CHECK: vqrshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N]]1
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   return vqrshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u16
-// CHECK: vqrshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N]]1
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   return vqrshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u32
-// CHECK: vqrshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N]]1
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   return vqrshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u64
-// CHECK: vqrshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N]]1
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   return vqrshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqrshrun_n_s16
-// CHECK: vqrshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQRSHRUN_N]]1
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
   return vqrshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s32
-// CHECK: vqrshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQRSHRUN_N]]1
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
   return vqrshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s64
-// CHECK: vqrshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQRSHRUN_N]]1
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
   return vqrshrun_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshl_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
   return vqshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
   return vqshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
   return vqshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
   return vqshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
   return vqshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
   return vqshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
   return vqshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
   return vqshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
   return vqshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
   return vqshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
   return vqshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
   return vqshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqshlu_n_s8
-// CHECK: vqshlu.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHLU_N]]
 uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
   return vqshlu_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s16
-// CHECK: vqshlu.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHLU_N]]1
 uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
   return vqshlu_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s32
-// CHECK: vqshlu.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHLU_N]]1
 uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
   return vqshlu_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: vqshlu.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHLU_N]]1
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
   return vqshlu_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s8
-// CHECK: vqshlu.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHLU_N]]
 uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
   return vqshluq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s16
-// CHECK: vqshlu.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHLU_N]]1
 uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
   return vqshluq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s32
-// CHECK: vqshlu.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHLU_N]]1
 uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
   return vqshluq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s64
-// CHECK: vqshlu.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHLU_N]]1
 uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
   return vqshluq_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshl_n_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
   return vqshl_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N]]1
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
   return vqshl_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N]]1
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
   return vqshl_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N]]1
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
   return vqshl_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
   return vqshl_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N]]1
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
   return vqshl_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N]]1
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
   return vqshl_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N]]1
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
   return vqshl_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
   return vqshlq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N]]1
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
   return vqshlq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N]]1
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
   return vqshlq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N]]1
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
   return vqshlq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
   return vqshlq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N]]1
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
   return vqshlq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N]]1
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
   return vqshlq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N]]1
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
   return vqshlq_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshrn_n_s16
-// CHECK: vqshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQSHRN_N]]1
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
   return vqshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s32
-// CHECK: vqshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQSHRN_N]]1
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
   return vqshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s64
-// CHECK: vqshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQSHRN_N]]1
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
   return vqshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u16
-// CHECK: vqshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQSHRN_N]]1
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
   return vqshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u32
-// CHECK: vqshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQSHRN_N]]1
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
   return vqshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u64
-// CHECK: vqshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQSHRN_N]]1
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
   return vqshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshrun_n_s16
-// CHECK: vqshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 1)
+// CHECK:   ret <8 x i8> [[VQSHRUN_N]]1
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
   return vqshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s32
-// CHECK: vqshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 1)
+// CHECK:   ret <4 x i16> [[VQSHRUN_N]]1
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
   return vqshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s64
-// CHECK: vqshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VQSHRUN_N]]1
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
   return vqshrun_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqsub_s8
-// CHECK: vqsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
   return vqsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s16
-// CHECK: vqsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
   return vqsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s32
-// CHECK: vqsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
   return vqsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s64
-// CHECK: vqsub.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
   return vqsub_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u8
-// CHECK: vqsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
   return vqsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u16
-// CHECK: vqsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
   return vqsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u32
-// CHECK: vqsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
   return vqsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u64
-// CHECK: vqsub.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
   return vqsub_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s8
-// CHECK: vqsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
   return vqsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s16
-// CHECK: vqsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
   return vqsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s32
-// CHECK: vqsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
   return vqsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s64
-// CHECK: vqsub.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
   return vqsubq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u8
-// CHECK: vqsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vqsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u16
-// CHECK: vqsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vqsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u32
-// CHECK: vqsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vqsubq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u64
-// CHECK: vqsub.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vqsubq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vraddhn_s16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
   return vraddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
   return vraddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
   return vraddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vraddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vraddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vraddhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vrecpe_f32
-// CHECK: vrecpe.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
   return vrecpe_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpe_u32
-// CHECK: vrecpe.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
   return vrecpe_u32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_f32
-// CHECK: vrecpe.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
   return vrecpeq_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_u32
-// CHECK: vrecpe.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
   return vrecpeq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vrecps_f32
-// CHECK: vrecps.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
+// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
   return vrecps_f32(a, b);
 }
 
-// CHECK-LABEL: test_vrecpsq_f32
-// CHECK: vrecps.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
   return vrecpsq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vreinterpret_s8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   return vreinterpret_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   return vreinterpret_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   return vreinterpret_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   return vreinterpret_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   return vreinterpret_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   return vreinterpret_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   return vreinterpret_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   return vreinterpret_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   return vreinterpret_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   return vreinterpret_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   return vreinterpret_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   return vreinterpret_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   return vreinterpret_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   return vreinterpret_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   return vreinterpret_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   return vreinterpret_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   return vreinterpret_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   return vreinterpret_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   return vreinterpret_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   return vreinterpret_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   return vreinterpret_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   return vreinterpret_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   return vreinterpret_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   return vreinterpret_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   return vreinterpret_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   return vreinterpret_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   return vreinterpret_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   return vreinterpret_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u64
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
   return vreinterpret_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
   return vreinterpret_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
   return vreinterpret_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
   return vreinterpret_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
   return vreinterpret_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
   return vreinterpret_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
   return vreinterpret_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
   return vreinterpret_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
   return vreinterpret_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
   return vreinterpret_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
   return vreinterpret_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
   return vreinterpret_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
   return vreinterpret_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
   return vreinterpret_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
   return vreinterpret_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
   return vreinterpret_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
   return vreinterpret_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
   return vreinterpret_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
   return vreinterpret_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
   return vreinterpret_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
   return vreinterpret_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
   return vreinterpret_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
   return vreinterpret_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
   return vreinterpret_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
   return vreinterpret_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
   return vreinterpret_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
   return vreinterpret_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
   return vreinterpret_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
   return vreinterpret_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
   return vreinterpret_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
   return vreinterpret_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
   return vreinterpret_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
   return vreinterpret_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
   return vreinterpret_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
   return vreinterpret_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
   return vreinterpret_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
   return vreinterpret_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
   return vreinterpret_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
   return vreinterpret_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
   return vreinterpret_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
   return vreinterpret_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s64
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
   return vreinterpret_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
   return vreinterpret_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
   return vreinterpret_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
   return vreinterpret_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
   return vreinterpret_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
   return vreinterpret_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
   return vreinterpret_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
   return vreinterpret_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
   return vreinterpret_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
   return vreinterpret_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
   return vreinterpret_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s64
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
   return vreinterpret_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
   return vreinterpret_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
   return vreinterpret_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
   return vreinterpret_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u64
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
   return vreinterpret_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
   return vreinterpret_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
   return vreinterpret_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
   return vreinterpret_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
   return vreinterpret_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
   return vreinterpret_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s32
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
   return vreinterpret_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s64
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
   return vreinterpret_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
   return vreinterpret_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
   return vreinterpret_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u32
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
   return vreinterpret_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u64
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
   return vreinterpret_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
   return vreinterpret_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
   return vreinterpret_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
   return vreinterpret_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
   return vreinterpret_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
   return vreinterpret_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
   return vreinterpret_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
   return vreinterpret_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
   return vreinterpret_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
   return vreinterpret_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
   return vreinterpret_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
   return vreinterpret_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
   return vreinterpret_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
   return vreinterpret_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
   return vreinterpret_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   return vreinterpret_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   return vreinterpret_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   return vreinterpret_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   return vreinterpret_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   return vreinterpret_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   return vreinterpret_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   return vreinterpret_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   return vreinterpret_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   return vreinterpret_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   return vreinterpret_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   return vreinterpret_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   return vreinterpretq_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   return vreinterpretq_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   return vreinterpretq_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   return vreinterpretq_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   return vreinterpretq_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   return vreinterpretq_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   return vreinterpretq_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   return vreinterpretq_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   return vreinterpretq_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   return vreinterpretq_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   return vreinterpretq_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   return vreinterpretq_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   return vreinterpretq_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   return vreinterpretq_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   return vreinterpretq_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   return vreinterpretq_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   return vreinterpretq_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   return vreinterpretq_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   return vreinterpretq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   return vreinterpretq_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   return vreinterpretq_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   return vreinterpretq_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   return vreinterpretq_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   return vreinterpretq_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   return vreinterpretq_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   return vreinterpretq_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   return vreinterpretq_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   return vreinterpretq_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   return vreinterpretq_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   return vreinterpretq_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   return vreinterpretq_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   return vreinterpretq_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   return vreinterpretq_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   return vreinterpretq_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   return vreinterpretq_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   return vreinterpretq_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   return vreinterpretq_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   return vreinterpretq_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   return vreinterpretq_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u64
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   return vreinterpretq_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   return vreinterpretq_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   return vreinterpretq_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   return vreinterpretq_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   return vreinterpretq_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s64
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s64
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u64
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s32
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s64
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u32
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u64
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }
 
 
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
 
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
 
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
 
 
-// CHECK-LABEL: test_vrhadd_s8
-// CHECK: vrhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
   return vrhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s16
-// CHECK: vrhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
   return vrhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s32
-// CHECK: vrhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
   return vrhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u8
-// CHECK: vrhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vrhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u16
-// CHECK: vrhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vrhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u32
-// CHECK: vrhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vrhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s8
-// CHECK: vrhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
   return vrhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s16
-// CHECK: vrhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
   return vrhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s32
-// CHECK: vrhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
   return vrhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u8
-// CHECK: vrhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vrhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u16
-// CHECK: vrhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vrhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u32
-// CHECK: vrhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vrhaddq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vrshl_s8
-// CHECK: vrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   return vrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s16
-// CHECK: vrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   return vrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s32
-// CHECK: vrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   return vrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s64

[... 9388 lines stripped ...]